Date: (Fri) Nov 20, 2015
Data: Source: Training: https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/download/NYTimesBlogTrain.csv
New: https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/download/NYTimesBlogTest.csv
Time period:
Based on analysis utilizing <> techniques,
Summary of key steps & error improvement stats:
Use plot.ly for interactive plots ?
varImp for randomForest crashes in caret version:6.0.41 -> submit bug report
extensions toward multiclass classification are scheduled for the next release
glm_dmy_mdl should use the same method as glm_sel_mdl until a custom dummy classifier is implemented
# Session setup: clear the workspace, fix the RNG seed, load the project helper
# scripts and register the parallel backend.
# NOTE(review): rm(list = ls()) wipes the caller's global environment; it is kept
# because this script is written to start from an empty workspace.
rm(list = ls())
set.seed(12345)
options(stringsAsFactors = FALSE)
# Project helper scripts (paths are relative to the user's home directory)
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mydsutils.R")
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
source("~/Dropbox/datascience/R/myplclust.R")
source("~/Dropbox/datascience/R/mytm.R")
# Gather all package requirements here
# library() is used instead of require(): require() only returns FALSE when a
# package is missing, which would surface later as an obscure
# "could not find function" error; library() stops right here instead.
suppressPackageStartupMessages(library(doMC))
registerDoMC(6) # # of cores on machine - 2
suppressPackageStartupMessages(library(caret))
#source("dbgcaret.R")
#packageVersion("snow")
#require(sos); findFn("cosine", maxPages=2, sortby="MaxScore")
# Analysis control global variables
# Inputs
# Locations of the training and new (test) observations
glb_trnng_url <- "https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/download/NYTimesBlogTrain.csv"
glb_newdt_url <- "https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015/download/NYTimesBlogTest.csv"

# Additional input files to concatenate; NULL : default
#   list(fnames = c("<fname1>", "<fname2>"))
glbInpMerge <- NULL

# New observations come from their own dataset: TRUE or FALSE
glb_is_separate_newobs_dataset <- TRUE
# select from c(FALSE, TRUE)
glb_split_entity_newobs_datasets <- FALSE
# select from c(NULL, "condition", "sample", "copy")
glb_split_newdata_method <- NULL
# NULL or "is.na(<var>)"; "<var> <condition_operator> <value>"
glb_split_newdata_condition <- NULL
# must be > 0 & < 1
glb_split_newdata_size_ratio <- 0.3
# any integer
glb_split_sample.seed <- 123

# Condition selecting observations to drop; NULL : default
#   "<condition>" # use | & ; NOT || &&
#parse(text=glbObsDropCondition)
#subset(glbObsAll, .grpid %in% c(31))
glbObsDropCondition <- NULL

# Condition used to re-partition the training observations; NULL : default
#   "<condition>"
glb_obs_repartition_train_condition <- NULL

# NULL or any integer
glb_max_fitobs <- NULL

# Problem type: classification is defined as "not regression"
glb_is_regression <- FALSE
glb_is_classification <- !glb_is_regression
# TRUE or FALSE
glb_is_binomial <- TRUE

# Response variable: raw column name and the name used for modelling.
# for classification, the response variable has to be a factor
glb_rsp_var_raw <- "Popular"
# glb_rsp_var_raw # or "Popular.fctr"
glb_rsp_var <- "Popular.fctr"
# if the response factor is based on numbers/logicals e.g (0/1 OR TRUE/FALSE vs. "A"/"B"),
# or contains spaces (e.g. "Not in Labor Force")
# caret predict(..., type="prob") crashes
# Maps the raw response to the factor used for classification.
#   raw:     vector of raw response values; NA marks an unknown response.
#   returns: factor of the same length with levels c("N", "Y"); values equal
#            to 1 become "Y", every other non-missing value becomes "N", and NA
#            stays NA.
glb_map_rsp_raw_to_var <- #NULL
function(raw) {
    # Alternative (regression) transformations:
    #   return(raw ^ 0.5)
    #   return(log(1 + raw))
    #   return(log10(raw))
    #   return(exp(-raw / 2))
    labels <- rep_len(NA_character_, length(raw))
    known <- !is.na(raw)
    labels[known] <- ifelse(raw[known] == 1, "Y", "N")
    # Both levels are declared explicitly: relevel(as.factor(x), ref = "N")
    # stops with "'ref' must be an existing level" whenever no "N" is present
    # (all-NA input such as unlabeled new observations, empty input, or input
    # containing only 1s). The level order N, Y is unchanged.
    return(factor(labels, levels = c("N", "Y")))
    # Other factor encodings:
    #   as.factor(paste0("B", raw))
    #   as.factor(gsub(" ", "\\.", raw))
}
# glb_map_rsp_raw_to_var(tst <- c(NA, 0, 1))
# glb_map_rsp_raw_to_var(tst <- c(NA, 0, 2.99, 280.50, 1000.00))
# Converts the modelling response back to the raw scale.
#   var:     response factor.
#   returns: numeric vector holding the integer code of each element minus 1
#            (first level -> 0, second level -> 1, ...); NA stays NA.
glb_map_rsp_var_to_raw <- #NULL
function(var) {
    # Alternative (regression) inverse transformations:
    #   return(var ^ 2.0)
    #   return(exp(var))
    #   return(10 ^ var)
    #   return(-log(var) * 2)
    level_codes <- as.numeric(var)
    return(level_codes - 1)
    # Other inverse factor encodings:
    #   gsub("\\.", " ", levels(var)[as.numeric(var)])
    #   c("<=50K", " >50K")[as.numeric(var)]
    #   c(FALSE, TRUE)[as.numeric(var)]
}
# glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(tst))
# A mapping function is mandatory whenever the modelling response differs from
# the raw response column.
if (glb_rsp_var != glb_rsp_var_raw) {
    if (is.null(glb_map_rsp_raw_to_var)) {
        stop("glb_map_rsp_raw_to_var function expected")
    }
}
# List info gathered for various columns
# <col_name>: <description>; <notes>
# NewsDesk = the New York Times desk that produced the story (Business, Culture, Foreign, etc.)
# SectionName = the section the article appeared in (Opinion, Arts, Technology, etc.)
# SubsectionName = the subsection the article appeared in (Education, Small Business, Room for Debate, etc.)
# Headline = the title of the article
# Snippet = a small portion of the article text
# Abstract = a summary of the blog article, written by the New York Times
# WordCount = the number of words in the article
# PubDate = the publication date, in the format "Year-Month-Day Hour:Minute:Second"
# UniqueID = a unique identifier for each article
# If multiple vars are parts of id, consider concatenating them to create one id var
# If glb_id_var == NULL, ".rownames <- row.names()" is the default
# User-specified exclusions
# Features never offered to the models.
# Template slots (currently unused):
#   - Feats that should be excluded due to known causation by prediction variable
#       "<feat1>", "<feat2>"
#   - Feats that are linear combinations (alias in glm)
glbFeatsExclude <- c(
    # Feature-engineering phase -> start by excluding all features except
    # id & category & work each one in
    "NewsDesk", "SectionName", "SubsectionName",
    "WordCount", "PubDate",
    # Feature Engineering done with prior features
    "Headline", "Snippet", "Abstract"
)
# The raw response must not be used as a predictor once it has been mapped to a
# separate modelling variable.
if (glb_rsp_var_raw != glb_rsp_var) {
    glbFeatsExclude <- union(glbFeatsExclude, glb_rsp_var_raw)
}
# Features used only through interactions
#glbFeatsInteractionOnly[["carrier.fctr"]] <- "cellular.fctr"
glbFeatsInteractionOnly <- list()

# Observation id: choose from c(NULL : default, "<id_feat>")
# currently does not handle more than 1 column; consider concatenating multiple columns
glb_id_var <- "UniqueID"

# Category feature: choose from c(NULL : default, "<category>")
glbFeatsCategory <- "NDSSName.my.fctr"

# Columns to drop outright: NULL or c("<feat1>", "<feat2>")
glb_drop_vars <- NULL

# Columns to map through lookup files: NULL or c("<var1>", "<var2>")
glb_map_vars <- NULL
# glb_map_urls[["<var1>"]] <- "<var1.url>"
glb_map_urls <- list()

# Value re-assignments per column
# glb_assign_pairs_lst[["<var1>"]] <- list(from=c(NA),
# to=c("NA.my"))
glb_assign_pairs_lst <- NULL
glb_assign_vars <- names(glb_assign_pairs_lst)

# Derived features; Use this mechanism to cleanse data ??? Cons: Data duplication ???
glbFeatsDerive <- list()
# glbFeatsDerive[["<feat.my.sfx>"]] <- list(
# mapfn = function(<arg1>, <arg2>) { return(function(<arg1>, <arg2>)) }
# , args = c("<arg1>", "<arg2>"))
# character
# mapfn = function(Week) { return(substr(Week, 1, 10)) }
# mapfn = function(descriptor) { return(plyr::revalue(descriptor, c(
# "ABANDONED BUILDING" = "OTHER",
# "**" = "**"
# ))) }
# NDSSName.my: one category key per article, built as
# "<NewsDesk>#<SectionName>#<SubsectionName>" with all spaces removed; keys
# listed in the recode table below are replaced by their mapped value.
glbFeatsDerive[["NDSSName.my"]] <- list(
    mapfn = function(NewsDesk, SectionName, SubsectionName) {
        # names = key as built from the data, values = key to use instead
        key_recodes <- c(
            "#BusinessDay#Dealbook" = "Business#BusinessDay#Dealbook",
            "#BusinessDay#SmallBusiness" = "Business#BusinessDay#SmallBusiness",
            "#Crosswords/Games#" = "Business#Crosswords/Games#",
            "#Open#" = "Business#Technology#",
            "#Technology#" = "Business#Technology#",
            "Business##" = "Business#Technology#",
            "#Arts#" = "Culture#Arts#",
            "Foreign##" = "Foreign#World#",
            "#World#AsiaPacific" = "Foreign#World#AsiaPacific",
            "#N.Y./Region#" = "Metro#N.Y./Region#",
            "#Opinion#" = "OpEd#Opinion#",
            "OpEd##" = "OpEd#Opinion#",
            "#Health#" = "Science#Health#",
            "Science##" = "Science#Health#",
            "Styles#Health#" = "Science#Health#",
            "Styles##" = "Styles##Fashion",
            "Styles#Style#Fashion&Style" = "Styles##Fashion",
            "#Travel#" = "Travel#Travel#",
            "Magazine#Magazine#" = "myOther",
            "National##" = "myOther",
            "National#U.S.#Politics" = "myOther",
            "Sports##" = "myOther",
            "Sports#Sports#" = "myOther",
            "#U.S.#" = "myOther"
        )
        joined_key <- paste(NewsDesk, SectionName, SubsectionName, sep = "#")
        compact_key <- gsub(" ", "", joined_key)
        plyr::revalue(compact_key, key_recodes)
    },
    args = c("NewsDesk", "SectionName", "SubsectionName")
)
# mapfn = function(description) { mod_raw <- description;
# This is here because it does not work if it's in txt_map_filename
# mod_raw <- gsub(paste0(c("\n", "\211", "\235", "\317", "\333"), collapse = "|"), " ", mod_raw)
# Don't parse for "." because of ".com"; use customized gsub for that text
# mod_raw <- gsub("(\\w)(!|\\*|,|-|/)(\\w)", "\\1\\2 \\3", mod_raw);
# return(mod_raw) }
#print(mod_raw <- grep(""", glbObsAll[, txt_var], value = TRUE))
#print(mod_raw <- glbObsAll[c(88,187,280,1040,1098), txt_var])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="\\bdoes( +)not\\b")), glbFeatsText])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="\\bipad [[:digit:]]\\b")), glbFeatsText][01:10])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="pad mini")), glbFeatsText][11:20])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="pad mini")), glbFeatsText][21:30])
#print(mod_raw <- glbObsAll[sel_obs(list(descr.my.contains="pad mini")), glbFeatsText][31:40])
#glbObsAll[which(glb_post_stop_words_terms_mtrx_lst[[txt_var]][, subset(glb_post_stop_words_terms_df_lst[[txt_var]], term %in% c("conditionminimal"))$pos] > 0), "description"]
# numeric
# Create feature based on record position/id in data
# glbFeatsDerive[["dummy.my"]] <- list(
# mapfn = function(UniqueID) { return(UniqueID) }
# , args = c("UniqueID"))
# Add logs of numerics that are not distributed normally
# Derive & keep multiple transformations of the same feature, if normality is hard to achieve with just one transformation
# Right skew: logp1; sqrt; ^ 1/3; logp1(logp1); log10; exp(-<feat>/constant)
# Three transformations of WordCount are derived and kept side by side.
glbFeatsDerive[["WordCount.log1p"]] <- list(
    mapfn = function(WordCount) log1p(WordCount),
    args = c("WordCount")
)
glbFeatsDerive[["WordCount.root2"]] <- list(
    mapfn = function(WordCount) WordCount ^ 0.5,
    args = c("WordCount")
)
# NOTE(review): exp(-x) underflows to exactly 0 in double precision once x
# exceeds roughly 745, so this feature is 0 for long articles; confirm whether
# a scaling constant (exp(-WordCount / <constant>)) was intended.
glbFeatsDerive[["WordCount.nexp"]] <- list(
    mapfn = function(WordCount) exp(-WordCount),
    args = c("WordCount")
)
#print(summary(glbObsAll$WordCount))
#print(summary(mapfn(glbObsAll$WordCount)))
# mapfn = function(Rasmussen) { return(ifelse(sign(Rasmussen) >= 0, 1, 0)) }
# mapfn = function(startprice) { return(startprice ^ (1/2)) }
# mapfn = function(startprice) { return(log(startprice)) }
# mapfn = function(startprice) { return(exp(-startprice / 20)) }
# mapfn = function(startprice) { return(scale(log(startprice))) }
# mapfn = function(startprice) { return(sign(sprice.predict.diff) * (abs(sprice.predict.diff) ^ (1/10))) }
# factor
# mapfn = function(PropR) { return(as.factor(ifelse(PropR >= 0.5, "Y", "N"))) }
# mapfn = function(productline, description) { as.factor(gsub(" ", "", productline)) }
# mapfn = function(purpose) { return(relevel(as.factor(purpose), ref="all_other")) }
# mapfn = function(raw) { tfr_raw <- as.character(cut(raw, 5));
# tfr_raw[is.na(tfr_raw)] <- "NA.my";
# return(as.factor(tfr_raw)) }
# mapfn = function(startprice.log10) { return(cut(startprice.log10, 3)) }
# mapfn = function(startprice.log10) { return(cut(sprice.predict.diff, c(-1000, -100, -10, -1, 0, 1, 10, 100, 1000))) }
# , args = c("<arg1>"))
# multiple args
# mapfn = function(PTS, oppPTS) { return(PTS - oppPTS) }
# mapfn = function(startprice.log10.predict, startprice) {
# return(spdiff <- (10 ^ startprice.log10.predict) - startprice) }
# mapfn = function(productline, description) { as.factor(
# paste(gsub(" ", "", productline), as.numeric(nchar(description) > 0), sep = "*")) }
# # If glbObsAll is not sorted in the desired manner
# mapfn=function(Week) { return(coredata(lag(zoo(orderBy(~Week, glbObsAll)$ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI) { return(coredata(lag(zoo(ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI.2.lag) { return(log(ILI.2.lag)) }
# glbFeatsDerive[["<var1>"]] <- glbFeatsDerive[["<var2>"]]
# Names of every derived feature registered so far
glb_derive_vars <- names(glbFeatsDerive)
# tst <- "descr.my"; args_lst <- NULL; for (arg in glbFeatsDerive[[tst]]$args) args_lst[[arg]] <- glbObsAll[, arg]; print(head(args_lst[[arg]])); print(head(drv_vals <- do.call(glbFeatsDerive[[tst]]$mapfn, args_lst)));
# print(which_ix <- which(args_lst[[arg]] == 0.75)); print(drv_vals[which_ix]);
# Date-time features: one named character vector of parsing options per column.
# Note: c() coerces the logical impute.na to the string "FALSE".
glbFeatsDateTime <- list(
    PubDate = c(format = "%Y-%m-%d %H:%M:%S",
                timezone = "America/New_York",
                impute.na = FALSE)
)
# NULL or c("<price_var>")
glbFeatsPrice <- NULL
# NULL or c("<txt_var>")
glbFeatsText <- NULL
Sys.setlocale("LC_ALL", "C") # For english
## [1] "C/C/C/C/C/en_US.UTF-8"
# Text Processing Step: custom modifications not present in txt_munge -> use glbFeatsDerive
# Text Processing Step: universal modifications
glb_txt_munge_filenames_pfx <- "NYTBlogs3_mytxt_"
# Text Processing Step: tolower
# Text Processing Step: myreplacePunctuation
# Text Processing Step: removeWords
# Remember to use unstemmed words
glb_txt_stop_words <- list()
# Stop words are only configured when at least one text feature is declared
if (!is.null(glbFeatsText)) {
    require(tm)
    glb_txt_stop_words[["<txt_var>"]] <- sort(c(NULL
        # Remove any words from stopwords
        # , setdiff(myreplacePunctuation(stopwords("english")), c("<keep_wrd1>", <keep_wrd2>"))
        # cor.y.train == NA
        # ,unlist(strsplit(paste(c(NULL
        # ,"<comma-separated-terms>"
        # ), collapse=",")
        # freq == 1; keep c("<comma-separated-terms-to-keep>")
        # ,<comma-separated-terms>
        # chisq.pval high (e.g. == 1); keep c("<comma-separated-terms-to-keep>")
        # ,<comma-separated-terms>
        # nzv.freqRatio high (e.g. >= glb_nzv_freqCut); keep c("<comma-separated-terms-to-keep>")
        # ,<comma-separated-terms>
    ))
}
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^2", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txt_var]][, 6] > 0, glbFeatsText]
# To identify terms with a specific freq
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txt_var]], freq == 1)$term), collapse = ",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txt_var]], freq <= 2)$term), collapse = ",")
# To identify terms with a specific freq &
# are not stemmed together later OR is value of color.fctr (e.g. gold)
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txt_var]], (freq == 1) & !(term %in% c("blacked","blemish","blocked","blocks","buying","cables","careful","carefully","changed","changing","chargers","cleanly","cleared","connect","connects","connected","contains","cosmetics","default","defaulting","defective","definitely","describe","described","devices","displays","drop","drops","engravement","excellant","excellently","feels","fix","flawlessly","frame","framing","gentle","gold","guarantee","guarantees","handled","handling","having","install","iphone","iphones","keeped","keeps","known","lights","line","lining","liquid","liquidation","looking","lots","manuals","manufacture","minis","most","mostly","network","networks","noted","opening","operated","performance","performs","person","personalized","photograph","physically","placed","places","powering","pre","previously","products","protection","purchasing","returned","rotate","rotation","running","sales","second","seconds","shipped","shuts","sides","skin","skinned","sticker","storing","thats","theres","touching","unusable","update","updates","upgrade","weeks","wrapped","verified","verify") ))$term), collapse = ",")
#print(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (freq <= 2)))
#glbObsAll[which(terms_mtrx[, 229] > 0), glbFeatsText]
# To identify terms with cor.y == NA
#orderBy(~-freq+term, subset(glb_post_stop_words_terms_df_lst[[txt_var]], is.na(cor.y)))
#paste(sort(subset(glb_post_stop_words_terms_df_lst[[txt_var]], is.na(cor.y))[, "term"]), collapse=",")
#orderBy(~-freq+term, subset(glb_post_stem_words_terms_df_lst[[txt_var]], is.na(cor.y)))
# To identify terms with low cor.y.abs
#head(orderBy(~cor.y.abs+freq+term, subset(glb_post_stem_words_terms_df_lst[[txt_var]], !is.na(cor.y))), 5)
# To identify terms with high chisq.pval
#subset(glb_post_stem_words_terms_df_lst[[txt_var]], chisq.pval > 0.99)
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (chisq.pval > 0.99) & (freq <= 10))$term), collapse=",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (chisq.pval > 0.9))$term), collapse=",")
#head(orderBy(~-chisq.pval+freq+term, glb_post_stem_words_terms_df_lst[[txt_var]]), 5)
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txt_var]][, 68] > 0, glbFeatsText]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^m", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
# To identify terms with high nzv.freqRatio
#summary(glb_post_stem_words_terms_df_lst[[txt_var]]$nzv.freqRatio)
#paste0(sort(setdiff(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (nzv.freqRatio >= glb_nzv_freqCut) & (freq < 10) & (chisq.pval >= 0.05))$term, c( "128gb","3g","4g","gold","ipad1","ipad3","ipad4","ipadair2","ipadmini2","manufactur","spacegray","sprint","tmobil","verizon","wifion"))), collapse=",")
# To identify obs with a txt term
#tail(orderBy(~-freq+term, glb_post_stop_words_terms_df_lst[[txt_var]]), 20)
#mydspObs(list(descr.my.contains="non"), cols=c("color", "carrier", "cellular", "storage"))
#grep("ever", dimnames(terms_stop_mtrx)$Terms)
#which(terms_stop_mtrx[, grep("ipad", dimnames(terms_stop_mtrx)$Terms)] > 0)
#glbObsAll[which(terms_stop_mtrx[, grep("16", dimnames(terms_stop_mtrx)$Terms)[1]] > 0), c(glbFeatsCategory, "storage", txt_var)]
# To identify whether terms shd be synonyms
#orderBy(~term, glb_post_stop_words_terms_df_lst[[txt_var]][grep("^moder", glb_post_stop_words_terms_df_lst[[txt_var]]$term), ])
# term_row_df <- glb_post_stop_words_terms_df_lst[[txt_var]][grep("^came$", glb_post_stop_words_terms_df_lst[[txt_var]]$term), ]
#
# cor(glb_post_stop_words_terms_mtrx_lst[[txt_var]][glbObsAll$.lcn == "Fit", term_row_df$pos], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
# To identify which stopped words are "close" to a txt term
#sort(cluster_vars)
# Text Processing Step: stemDocument
# To identify stemmed txt terms
#glb_post_stop_words_terms_df_lst[[txt_var]][grep("condit", glb_post_stop_words_terms_df_lst[[txt_var]]$term), ]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^con", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
#glbObsAll[which(terms_stem_mtrx[, grep("use", dimnames(terms_stem_mtrx)$Terms)[[1]]] > 0), c(glb_id_var, "productline", txt_var)]
#glbObsAll[which(TfIdf_stem_mtrx[, 191] > 0), c(glb_id_var, glbFeatsCategory, txt_var)]
#which(glbObsAll$UniqueID %in% c(11915, 11926, 12198))
# Text Processing Step: mycombineSynonyms
# To identify which terms are associated with not -> combine "could not" & "couldn't"
#findAssocs(glb_full_DTM_lst[[txt_var]], "not", 0.05)
# To identify which synonyms should be combined
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^c", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
# Prints the stemmed-term statistics for a set of candidate synonyms, then the
# corpus-term statistics of the combined word after the synonyms are merged.
#   syn_lst: list(word = "<stem>", syns = c("<stem>", "<stem_2>", ...))
# Reads the globals txt_var, glb_post_stem_words_terms_df_lst and
# glb_txt_corpus_lst; invisibly returns the second printed data frame.
chk_comb_cor <- function(syn_lst) {
    # cor(terms_stem_mtrx[glbObsAll$.src == "Train", grep("^(damag|dent|ding)$", dimnames(terms_stem_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
    stem_terms_df <- glb_post_stem_words_terms_df_lst[[txt_var]]
    print(subset(stem_terms_df, term %in% syn_lst$syns))

    combined_corpus <- tm_map(glb_txt_corpus_lst[[txt_var]], mycombineSynonyms,
                              list(syn_lst), lazy=FALSE)
    combined_terms_df <- get_corpus_terms(combined_corpus)
    print(subset(combined_terms_df, term == syn_lst$word))
    # cor(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
    # cor(rowSums(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])]), glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
}
#chk_comb_cor(syn_lst=list(word="cabl", syns=c("cabl", "cord")))
#chk_comb_cor(syn_lst=list(word="damag", syns=c("damag", "dent", "ding")))
#chk_comb_cor(syn_lst=list(word="dent", syns=c("dent", "ding")))
#chk_comb_cor(syn_lst=list(word="use", syns=c("use", "usag")))
# Synonym groups per text variable
#glb_txt_synonyms[["<txt_var>"]] <- list(NULL
# , list(word="<stem1>", syns=c("<stem1>", "<stem1_2>"))
# )
glb_txt_synonyms <- list()

# Term-matrix controls
glb_txt_terms_control <- list(
    # options include: "weightTf", "myweightTflog1p", "myweightTfsqrt", "weightTfIdf", "weightBM25"
    weighting = "weightTfIdf", # : default
    # termFreq selection criteria across obs: tm default: list(global=c(1, Inf))
    bounds = list(global = c(1, Inf)),
    # wordLengths selection criteria: tm default: c(3, Inf)
    wordLengths = c(1, Inf)
)
# : default # or c(<feat>)
glb_txt_cor_var <- glb_rsp_var
# select one from c("union.top.val.cor", "top.cor", "top.val", default: "top.chisq", "sparse")
glbFeatsTextFilter <- "top.chisq"
# One entry per text variable, named after it; 10 :default
glbFeatsTextTermsMax <- setNames(rep(10, length(glbFeatsText)), glbFeatsText)
# Text Processing Step: extractAssoc
# One entry per text variable, named after it; 1 :default
glbFeatsTextAssocCor <- setNames(rep(1, length(glbFeatsText)), glbFeatsText)
# Remember to use stemmed terms
glb_important_terms <- list()
# Text Processing Step: extractPatterns (ngrams)
glbFeatsTextPatterns <- list()
#glbFeatsTextPatterns[[<txt_var>]] <- list()
#glbFeatsTextPatterns[[<txt_var>]] <- c(metropolitan.diary.colon = "Metropolitan Diary:")

# Sparsity thresholds; have to set it even if it is not used
# Properties:
# numrows(glb_feats_df) << numrows(glbObsFit)
# Select terms that appear in at least 0.2 * O(FP/FN(glbObsOOB)) ???
# numrows(glbObsOOB) = 1.1 * numrows(glbObsNew) ???
# NULL or c(<txt_var1> = 0.988, <txt_var2> = 0.970, <txt_var3> = 0.970)
glb_sprs_thresholds <- NULL

# default: 20
glbFctrMaxUniqVals <- 21
# TRUE or FALSE
glb_impute_na_data <- TRUE
# any integer
glb_mice_complete.seed <- 144

# FALSE : default or TRUE
glb_cluster <- FALSE
# any integer
glb_cluster.seed <- 189
# c(glb_rsp_var, as.factor(cut(glb_rsp_var, 3)), default: NULL)
glb_cluster_entropy_var <- glb_rsp_var
# default FALSE
glbFeatsTextClusterVarsExclude <- FALSE

# NULL : default or c(<parent_feat> = "<child_feat>")
glb_interaction_only_feats <- NULL

# Near-zero-variance cut-offs
# 19 : caret default
glb_nzv_freqCut <- 19
# 10 : caret default
glb_nzv_uniqueCut <- 10

# Recursive feature elimination subset sizes per model family
#glbRFESizes[["mdlFamily"]] <- c(4, 8, 16, 32, 64, 67, 68, 69) # Accuracy@69/70 = 0.8258
glbRFESizes <- list()

# Outlier observation ids per model family
glbObsFitOutliers <- list()
# If outliers.n >= 10; consider concatenation of interaction vars
# glbObsFitOutliers[["<mdlFamily>"]] <- c(NULL
# is.na(.rstudent)
# is.na(.dffits)
# .hatvalues >= 0.99
# -38,167,642 < minmax(.rstudent) < 49,649,823
# , <comma-separated-<glb_id_var>>
# )
glbObsTrnOutliers <- list()
# influence.measures: car::outlier; rstudent; dffits; hatvalues; dfbeta; dfbetas
#mdlId <- "RFE.X.glm"; obs_df <- fitobs_df
#mdlId <- "Final.glm"; obs_df <- trnobs_df
#mdlId <- "CSM2.X.glm"; obs_df <- fitobs_df
#print(outliers <- car::outlierTest(glb_models_lst[[mdlId]]$finalModel))
#mdlIdFamily <- paste0(head(unlist(str_split(mdlId, "\\.")), -1), collapse="."); obs_df <- dplyr::filter_(obs_df, interp(~(!(var %in% glbObsFitOutliers[[mdlIdFamily]])), var = as.name(glb_id_var))); model_diags_df <- cbind(obs_df, data.frame(.rstudent=stats::rstudent(glb_models_lst[[mdlId]]$finalModel)), data.frame(.dffits=stats::dffits(glb_models_lst[[mdlId]]$finalModel)), data.frame(.hatvalues=stats::hatvalues(glb_models_lst[[mdlId]]$finalModel)));print(summary(model_diags_df[, c(".rstudent",".dffits",".hatvalues")])); table(cut(model_diags_df$.hatvalues, breaks=c(0.00, 0.98, 0.99, 1.00)))
#print(subset(model_diags_df, is.na(.rstudent))[, glb_id_var])
#print(subset(model_diags_df, is.na(.dffits))[, glb_id_var])
#print(model_diags_df[which.min(model_diags_df$.dffits), ])
#print(subset(model_diags_df, .hatvalues > 0.99)[, glb_id_var])
#dffits_df <- merge(dffits_df, outliers_df, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#dffits_df <- merge(dffits_df, glbObsFit, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#subset(dffits_df, !is.na(.Bonf.p))
#mdlId <- "CSM.X.glm"; vars <- myextract_actual_feats(row.names(orderBy(reformulate(c("-", paste0(mdlId, ".imp"))), myget_feats_imp(glb_models_lst[[mdlId]]))));
#model_diags_df <- glb_get_predictions(model_diags_df, mdlId, glb_rsp_var)
#obs_ix <- row.names(model_diags_df) %in% names(outliers$rstudent)[1]
#obs_ix <- which(is.na(model_diags_df$.rstudent))
#obs_ix <- which(is.na(model_diags_df$.dffits))
#myplot_parcoord(obs_df=model_diags_df[, c(glb_id_var, glbFeatsCategory, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, paste0(glb_rsp_var, mdlId), vars[1:min(20, length(vars))])], obs_ix=obs_ix, id_var=glb_id_var, category_var=glbFeatsCategory)
#model_diags_df[row.names(model_diags_df) %in% names(outliers$rstudent)[c(1:2)], ]
#ctgry_diags_df <- model_diags_df[model_diags_df[, glbFeatsCategory] %in% c("Unknown#0"), ]
#myplot_parcoord(obs_df=ctgry_diags_df[, c(glb_id_var, glbFeatsCategory, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:20])], obs_ix=row.names(ctgry_diags_df) %in% names(outliers$rstudent)[1], id_var=glb_id_var, category_var=glbFeatsCategory)
#table(glbObsFit[model_diags_df[, glbFeatsCategory] %in% c("iPad1#1"), "startprice.log10.cut.fctr"])
#glbObsFit[model_diags_df[, glbFeatsCategory] %in% c("iPad1#1"), c(glb_id_var, "startprice")]
# No outliers & .dffits == NaN
#myplot_parcoord(obs_df=model_diags_df[, c(glb_id_var, glbFeatsCategory, glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:10])], obs_ix=seq(1:nrow(model_diags_df))[is.na(model_diags_df$.dffits)], id_var=glb_id_var, category_var=glbFeatsCategory)
# Modify mdlId to (build & extract) "<FamilyId>#<Fit|Trn>#<caretMethod>#<preProc1.preProc2>#<samplingMethod>"
# Containers filled in as models are trained
glb_models_lst <- list()
glb_models_df <- data.frame()

# Candidate caret methods, chosen by problem type
if (glb_is_regression) {
    # Regression
    glbMdlMethods <- c(
        # deterministic
        #, "lm", # same as glm
        "glm", "bayesglm", "glmnet",
        "rpart",
        # non-deterministic
        "gbm", "rf",
        # Unknown
        "nnet", "avNNet", # runs 25 models per cv sample for tunelength=5
        "svmLinear", "svmLinear2",
        "svmPoly", # runs 75 models per cv sample for tunelength=5
        "svmRadial",
        "earth",
        "bagEarth" # Takes a long time
    )
} else if (glb_is_binomial) {
    # Classification - Add ada (auto feature selection)
    glbMdlMethods <- c(
        # deterministic
        "bagEarth", # Takes a long time
        "glm", "bayesglm", "glmnet",
        "nnet",
        "rpart",
        # non-deterministic
        "gbm",
        "avNNet", # runs 25 models per cv sample for tunelength=5
        "rf",
        # Unknown
        "lda", "lda2",
        # svm models crash when predict is called -> internal to kernlab it should call predict without .outcome
        "svmLinear", "svmLinear2",
        "svmPoly", # runs 75 models per cv sample for tunelength=5
        "svmRadial",
        "earth"
    )
} else {
    # Classification with a non-binomial response
    glbMdlMethods <- c(
        # non-deterministic
        "rf",
        # Unknown
        "gbm", "rpart"
    )
}
# Model families to train and the caret methods used for each.
# family: Choose from c("RFE.X", "CSM.X", "All.X", "Best.Interact")
# methods: Choose from c(NULL, <method>, glbMdlMethods)
#glb_mdl_family_lst[["RFE.X"]] <- c("glmnet", "glm") # non-NULL list is mandatory
#glb_mdl_family_lst[["Best.Interact"]] <- "glmnet" # non-NULL list is mandatory
glb_mdl_family_lst <- list(All.X = "glmnet") # non-NULL list is mandatory
glb_mdl_feats_lst <- list()
# Check if interaction features make RFE better
# glb_mdl_family_lst[["CSM.X"]] <- setdiff(glbMdlMethods, c("lda", "lda2")) # crashing due to category:.clusterid ??? #c("glmnet", "glm") # non-NULL list is mandatory
# glb_mdl_feats_lst[["CSM.X"]] <- c(NULL
# , <comma-separated-features-vector>
# )
# dAFeats.CSM.X %<d-% c(NULL
# # Interaction feats up to varImp(RFE.X.glmnet) >= 50
# , <comma-separated-features-vector>
# , setdiff(myextract_actual_feats(predictors(rfe_fit_results)), c(NULL
# , <comma-separated-features-vector>
# ))
# )
# glb_mdl_feats_lst[["CSM.X"]] <- "%<d-% dAFeats.CSM.X"

# Check if tuning parameters make fit better; make it mdlFamily customizable ?
glb_tune_models_df <- data.frame()
# Experiment specific code to avoid caret crash
# One row per tuning parameter; vals is a space-separated list of candidates.
glmnet_tune_models_df <- data.frame(
    method = "glmnet",
    parameter = c("alpha", "lambda"),
    vals = c("0.100 0.325 0.550 0.775 1.000", "9.342e-02")
)
#avNNet
# size=[1] 3 5 7 9; decay=[0] 1e-04 0.001 0.01 0.1; bag=[FALSE]; RMSE=1.3300906
#bagEarth
# degree=1 [2] 3; nprune=64 128 256 512 [1024]; RMSE=0.6486663 (up)
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "bagEarth", parameter = "nprune", vals = "256")
# ,data.frame(method = "bagEarth", parameter = "degree", vals = "2")
# ))
#earth
# degree=[1]; nprune=2 [9] 17 25 33; RMSE=0.1334478
#gbm
# shrinkage=0.05 [0.10] 0.15 0.20 0.25; n.trees=100 150 200 [250] 300; interaction.depth=[1] 2 3 4 5; n.minobsinnode=[10]; RMSE=0.2008313
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "gbm", parameter = "shrinkage", min = 0.05, max = 0.25, by = 0.05)
# ,data.frame(method = "gbm", parameter = "n.trees", min = 100, max = 300, by = 50)
# ,data.frame(method = "gbm", parameter = "interaction.depth", min = 1, max = 5, by = 1)
# ,data.frame(method = "gbm", parameter = "n.minobsinnode", min = 10, max = 10, by = 10)
# #seq(from=0.05, to=0.25, by=0.05)
# ))
#glmnet
# alpha=0.100 [0.325] 0.550 0.775 1.000; lambda=0.0005232693 0.0024288010 0.0112734954 [0.0523269304] 0.2428800957; RMSE=0.6164891
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "glmnet", parameter = "alpha", vals = "0.550 0.775 0.8875 0.94375 1.000")
# ,data.frame(method = "glmnet", parameter = "lambda", vals = "9.858855e-05 0.0001971771 0.0009152152 0.0042480525 0.0197177130")
# ))
#nnet
# size=3 5 [7] 9 11; decay=0.0001 0.001 0.01 [0.1] 0.2; RMSE=0.9287422
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "nnet", parameter = "size", vals = "3 5 7 9 11")
# ,data.frame(method = "nnet", parameter = "decay", vals = "0.0001 0.0010 0.0100 0.1000 0.2000")
# ))
#rf # Don't bother; results are not deterministic
# mtry=2 35 68 [101] 134; RMSE=0.1339974
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "rf", parameter = "mtry", vals = "2 5 9 13 17")
# ))
#rpart
# cp=0.020 [0.025] 0.030 0.035 0.040; RMSE=0.1770237
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "rpart", parameter = "cp", vals = "0.004347826 0.008695652 0.017391304 0.021739130 0.034782609")
# ))
#svmLinear
# C=0.01 0.05 [0.10] 0.50 1.00 2.00 3.00 4.00; RMSE=0.1271318; 0.1296718
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "svmLinear", parameter = "C", vals = "0.01 0.05 0.1 0.5 1")
# ))
#svmLinear2
# cost=0.0625 0.1250 [0.25] 0.50 1.00; RMSE=0.1276354
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "svmLinear2", parameter = "cost", vals = "0.0625 0.125 0.25 0.5 1")
# ))
#svmPoly
# degree=[1] 2 3 4 5; scale=0.01 0.05 [0.1] 0.5 1; C=0.50 1.00 [2.00] 3.00 4.00; RMSE=0.1276130
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method="svmPoly", parameter="degree", min=1, max=5, by=1) #seq(1, 5, 1)
# ,data.frame(method="svmPoly", parameter="scale", vals="0.01, 0.05, 0.1, 0.5, 1")
# ,data.frame(method="svmPoly", parameter="C", vals="0.50, 1.00, 2.00, 3.00, 4.00")
# ))
#svmRadial
# sigma=[0.08674323]; C=0.25 0.50 1.00 [2.00] 4.00; RMSE=0.1614957
#glb2Sav(); all.equal(sav_models_df, glb_models_df)
glb_preproc_methods <- NULL
# c("YeoJohnson", "center.scale", "range", "pca", "ica", "spatialSign")
# Baseline prediction model feature(s)
glb_Baseline_mdl_var <- NULL # or c("<feat>")
glbMdlMetric_terms <- NULL # or matrix(c(
# 0,1,2,3,4,
# 2,0,1,2,3,
# 4,2,0,1,2,
# 6,4,2,0,1,
# 8,6,4,2,0
# ), byrow=TRUE, nrow=5)
glbMdlMetricSummary <- NULL # or "<metric_name>"
glbMdlMetricMaximize <- NULL # or FALSE (TRUE is not the default for both classification & regression)
glbMdlMetricSummaryFn <- NULL # or function(data, lev=NULL, model=NULL) {
# confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
# #print(confusion_mtrx)
# #print(confusion_mtrx * glbMdlMetric_terms)
# metric <- sum(confusion_mtrx * glbMdlMetric_terms) / nrow(data)
# names(metric) <- glbMdlMetricSummary
# return(metric)
# }
glb_rcv_n_folds <- 3 # or NULL
glb_rcv_n_repeats <- 3 # or NULL
glb_clf_proba_threshold <- NULL # 0.5
# Model selection criteria: metric ids used to evaluate models, chosen by
#   problem type. Relies on glb_is_regression, glb_is_classification and
#   glb_is_binomial being set earlier in the script.
if (glb_is_regression) {
    glbMdlMetricsEval <-
        c("min.RMSE.OOB", "max.R.sq.OOB", "max.Adj.R.sq.fit", "min.RMSE.fit")
}
#glbMdlMetricsEval <- c("min.RMSE.fit", "max.R.sq.fit", "max.Adj.R.sq.fit")
if (glb_is_classification) {
    # Binomial problems get the AUC / AIC based criteria; other
    #   classifications fall back to Accuracy & Kappa
    glbMdlMetricsEval <- if (glb_is_binomial) {
        c("max.Accuracy.OOB", "max.AUCROCR.OOB", "max.AUCpROC.OOB",
          "min.aic.fit", "max.Accuracy.fit")
    } else {
        c("max.Accuracy.OOB", "max.Kappa.OOB")
    }
}
# select from NULL [no ensemble models], "auto" [all models better than MFO or Baseline], c(mdl_ids in glb_models_lst) [Typically top-rated models in auto]
glb_mdl_ensemble <- NULL
# "%<d-% setdiff(mygetEnsembleAutoMdlIds(), 'CSM.X.rf')"
# c(<comma-separated-mdlIds>
# )
# Only for classifications; for regressions remove "(.*)\\.prob" from the regex
# tmp_fitobs_df <- glbObsFit[, grep(paste0("^", gsub(".", "\\.", mygetPredictIds$value, fixed = TRUE), "CSM\\.X\\.(.*)\\.prob"), names(glbObsFit), value = TRUE)]; cor_mtrx <- cor(tmp_fitobs_df); cor_vctr <- sort(cor_mtrx[row.names(orderBy(~-Overall, varImp(glb_models_lst[["Ensemble.repeatedcv.glmnet"]])$imp))[1], ]); summary(cor_vctr); cor_vctr
#ntv.glm <- glm(reformulate(indep_vars, glb_rsp_var), family = "binomial", data = glbObsFit)
#step.glm <- step(ntv.glm)
# Ids of the selected & final models; the final model id is left NULL here
glb_sel_mdl_id <- "All.X##rcv#glmnet" #select from c(NULL, "All.X##rcv#glmnet", "RFE.X##rcv#glmnet", <mdlId>)
glb_fin_mdl_id <- NULL #select from c(NULL, glb_sel_mdl_id)
# Columns to display: id, category & response variables (all defined earlier in
#   the script); further columns can be appended inside the c() call
glb_dsp_cols <- c(glb_id_var, glbFeatsCategory, glb_rsp_var
# List critical cols excl. glb_id_var, glbFeatsCategory & glb_rsp_var
)
# Output specs
glbOutDataVizFname <- "NYTBlogs3_obsall.csv" # choose from c(NULL, "NYTBlogs3_obsall.csv")
glb_out_obs <- NULL # select from c(NULL : default to "new", "all", "new", "trn")
# Named list of output columns, keyed by output column name
glb_out_vars_lst <- list()
# glb_id_var will be the first output column, by default
# NOTE(review): the value is stored as a string starting with "%<d-%";
#   presumably it is parsed & evaluated later (delayed binding), once
#   glb_fin_mdl_id is known -- confirm against the code that consumes it
glb_out_vars_lst[["Probability1"]] <-
"%<d-% mygetPredictIds(glb_rsp_var, glb_fin_mdl_id)$prob"
# glb_out_vars_lst[[glb_rsp_var_raw]] <- glb_rsp_var_raw
# glb_out_vars_lst[[paste0(head(unlist(strsplit(mygetPredictIds$value, "")), -1), collapse = "")]] <-
glbOutStackFnames <- NULL #: default
# c("ebayipads_txt_assoc1_out_bid1_stack.csv") # manual stack
# c("ebayipads_finmdl_bid1_out_nnet_1.csv") # universal stack
# Prefix for output file names (e.g. pasted in front of "FeatsTxtClusters.png"
#   in the clustering step)
glb_out_pfx <- "NYTBlogs3_feat_PubDate_"
glb_save_envir <- FALSE # or TRUE
# Depict process
# Builds a petri-net description of the analytics workflow via the project
#   helper petrinet():
#   trans_df  - six transitions (id, name, x, y)
#   places_df - four places (id, name, x, y, M0)
#   arcs_df   - twelve begin -> end pairs linking places & transitions by name
# NOTE(review): x / y look like plot coordinates and M0 like the initial
#   marking (3 tokens on "bgn") -- confirm against mypetrinet.R
glb_analytics_pn <- petrinet(name = "glb_analytics_pn",
trans_df = data.frame(id = 1:6,
name = c("data.training.all","data.new",
"model.selected","model.final",
"data.training.all.prediction","data.new.prediction"),
x=c( -5,-5,-15,-25,-25,-35),
y=c( -5, 5, 0, 0, -5, 5)
),
places_df=data.frame(id=1:4,
name=c("bgn","fit.data.training.all","predict.data.new","end"),
x=c( -0, -20, -30, -40),
y=c( 0, 0, 0, 0),
M0=c( 3, 0, 0, 0)
),
arcs_df=data.frame(
begin=c("bgn","bgn","bgn",
"data.training.all","model.selected","fit.data.training.all",
"fit.data.training.all","model.final",
"data.new","predict.data.new",
"data.training.all.prediction","data.new.prediction"),
end =c("data.training.all","data.new","model.selected",
"fit.data.training.all","fit.data.training.all","model.final",
"data.training.all.prediction","predict.data.new",
"predict.data.new","data.new.prediction",
"end","end")
))
#print(ggplot.petrinet(glb_analytics_pn))
# Plot the net with the axes flipped
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
# No analytics objects are available yet
glb_analytics_avl_objs <- NULL
# Start the chunk log with the "import.data" step; the table printed below
#   shows its label, step numbers and bgn / end / elapsed columns
glb_chunks_df <- myadd_chunk(NULL, "import.data")
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 17.432 NA NA
1.0: import data
## [1] "Reading file ./data/NYTimesBlogTrain.csv..."
## [1] "dimensions of data in ./data/NYTimesBlogTrain.csv: 6,532 rows x 10 cols"
## NewsDesk SectionName SubsectionName
## 1 Business Crosswords/Games
## 2 Culture Arts
## 3 Business Business Day Dealbook
## 4 Business Business Day Dealbook
## 5 Science Health
## 6 Science Health
## Headline
## 1 More School Daze
## 2 New 96-Page Murakami Work Coming in December
## 3 Public Pension Funds Stay Mum on Corporate Expats
## 4 Boot Camp for Bankers
## 5 Of Little Help to Older Knees
## 6 A Benefit of Legal Marijuana
## Snippet
## 1 A puzzle from Ethan Cooper that reminds me that a bill is due.
## 2 The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His Years of Pilgrimage.
## 3 Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little about the strategy, which could hurt the nations tax base.
## 4 As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service members ideal customers.
## 5 Middle-aged and older patients are unlikely to benefit in the long term from surgery to repair tears in the meniscus, pads of cartilage in the knee, a new review of studies has found.
## 6 A new study has found evidence that legal access to marijuana is associated with fewer opioid overdose deaths, but researchers said their findings should not be used as the basis for the wide adoption of legalized cannabis.
## Abstract
## 1 A puzzle from Ethan Cooper that reminds me that a bill is due.
## 2 The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His Years of Pilgrimage.
## 3 Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little about the strategy, which could hurt the nations tax base.
## 4 As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service members ideal customers.
## 5 Middle-aged and older patients are unlikely to benefit in the long term from surgery to repair tears in the meniscus, pads of cartilage in the knee, a new review of studies has found.
## 6 A new study has found evidence that legal access to marijuana is associated with fewer opioid overdose deaths, but researchers said their findings should not be used as the basis for the wide adoption of legalized cannabis.
## WordCount PubDate Popular UniqueID
## 1 508 2014-09-01 22:00:09 1 1
## 2 285 2014-09-01 21:14:07 0 2
## 3 1211 2014-09-01 21:05:36 0 3
## 4 1405 2014-09-01 20:43:34 1 4
## 5 181 2014-09-01 18:58:51 1 5
## 6 245 2014-09-01 18:52:22 1 6
## NewsDesk SectionName SubsectionName
## 226 Styles
## 995
## 3327
## 4753 Multimedia
## 4802 Business Crosswords/Games
## 6463 TStyle
## Headline
## 226 For Tavi Gevinson, Fashion Takes a Back Seat, for Now
## 995 Reconsidering What to Call an Extremist Group
## 3327 Clinton's Diagnosis of What's Wrong With Politics
## 4753 'Off Color' and on Target About Race in America
## 4802 Daniel Finkel's Circle-Toss Game
## 6463 Entering the Void
## Snippet
## 226 Tavi Gevinson, the teenage fashion star turned Broadway actress, wont be much of a player at New York Fashion Week this season.
## 995 Editors have decided to adjust how The Times refer to an Islamic extremist group that controls territory in Syria and Iraq.
## 3327 Hillary Rodham Clinton continued to laugh off questions about her presidential aspirations on Tuesday, but she did shed some light on what she thinks is wrong in Washington.
## 4753 Off Color, a New York Times video series, looks at how artists of color are making sharp social commentary about race in America through comedy and performance.
## 4802 By math educator Daniel Finkel, a puzzle thats childs play. Can you figure it out?
## 6463 The Spanish artist Miquel Barcel closely examines the basic materials of life in response to Edward Hirsch questioning his own belief in a higher power.
## Abstract
## 226 Tavi Gevinson, the teenage fashion star turned Broadway actress, wont be much of a player at New York Fashion Week this season.
## 995 Editors have decided to adjust how The Times refer to an Islamic extremist group that controls territory in Syria and Iraq.
## 3327 Hillary Rodham Clinton continued to laugh off questions about her presidential aspirations on Tuesday, but she did shed some light on what she thinks is wrong in Washington.
## 4753 Off Color, a New York Times video series, looks at how artists of color are making sharp social commentary about race in America through comedy and performance.
## 4802 By math educator Daniel Finkel, a puzzle thats childs play. Can you figure it out?
## 6463 The Spanish artist Miquel Barcel closely examines the basic materials of life in response to Edward Hirsch questioning his own belief in a higher power.
## WordCount PubDate Popular UniqueID
## 226 459 2014-09-04 16:55:57 0 226
## 995 301 2014-09-15 16:05:13 0 995
## 3327 236 2014-10-14 14:45:51 0 3327
## 4753 393 2014-11-02 05:00:13 0 4753
## 4802 1628 2014-11-03 12:00:04 1 4802
## 6463 264 2014-11-27 12:00:09 0 6463
## NewsDesk SectionName SubsectionName
## 6527 Foreign
## 6528 Opinion Room For Debate
## 6529 Foreign
## 6530 TStyle
## 6531 Multimedia
## 6532 Business
## Headline
## 6527 1914: Russians Dominate in East Poland
## 6528 Finding a Secretary of Defense
## 6529 1889: Metropolitan Opera House Reopens in New York
## 6530 The Daily Gift: Picasso Plates for Creative Dining
## 6531 Racing From New York to Barcelona
## 6532 Math Anxiety: Why Hollywood Makes Robots of Alan Turing and Other Geniuses
## Snippet
## 6527 From the International Herald Tribune archives: Russians dominate in East Poland in 1914.
## 6528 If Chuck Hagel isn't the right Pentagon chief to respond to an onslaught of global crises, who is?
## 6529 From the International Herald Tribune archives: The Metropolitan Opera House reopens in New York in 1889.
## 6530 Each day until Christmas, the editors of T share a new holiday gift idea.
## 6531 A sailboat race from New York to Barcelona was the setting for a thrilling and sometimes terrifying video about this challenging sport.
## 6532 The visionary who stares at formulas written on walls or mirrors or better yet, thin air has become a Hollywood trope. So has the depiction of the genius who cant connect with real people.
## Abstract
## 6527 From the International Herald Tribune archives: Russians dominate in East Poland in 1914.
## 6528 If Chuck Hagel isn't the right Pentagon chief to respond to an onslaught of global crises, who is?
## 6529 From the International Herald Tribune archives: The Metropolitan Opera House reopens in New York in 1889.
## 6530 Each day until Christmas, the editors of T share a new holiday gift idea.
## 6531 A sailboat race from New York to Barcelona was the setting for a thrilling and sometimes terrifying video about this challenging sport.
## 6532 The visionary who stares at formulas written on walls or mirrors or better yet, thin air has become a Hollywood trope. So has the depiction of the genius who cant connect with real people.
## WordCount PubDate Popular UniqueID
## 6527 176 2014-11-30 13:48:40 0 6527
## 6528 1597 2014-11-30 13:27:23 0 6528
## 6529 214 2014-11-30 09:44:57 0 6529
## 6530 61 2014-11-30 09:00:43 0 6530
## 6531 441 2014-11-30 09:00:22 0 6531
## 6532 921 2014-11-30 07:00:40 0 6532
## 'data.frame': 6532 obs. of 10 variables:
## $ NewsDesk : chr "Business" "Culture" "Business" "Business" ...
## $ SectionName : chr "Crosswords/Games" "Arts" "Business Day" "Business Day" ...
## $ SubsectionName: chr "" "" "Dealbook" "Dealbook" ...
## $ Headline : chr "More School Daze" "New 96-Page Murakami Work Coming in December" "Public Pension Funds Stay Mum on Corporate Expats" "Boot Camp for Bankers" ...
## $ Snippet : chr "A puzzle from Ethan Cooper that reminds me that a bill is due." "The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His"| __truncated__ "Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little"| __truncated__ "As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service "| __truncated__ ...
## $ Abstract : chr "A puzzle from Ethan Cooper that reminds me that a bill is due." "The Strange Library will arrive just three and a half months after Mr. Murakamis latest novel, Colorless Tsukuru Tazaki and His"| __truncated__ "Public pension funds have major stakes in American companies moving overseas to cut their tax bills. But they are saying little"| __truncated__ "As they struggle to find new business to bolster sluggish earnings, banks consider the nations 25 million veterans and service "| __truncated__ ...
## $ WordCount : int 508 285 1211 1405 181 245 258 893 1077 188 ...
## $ PubDate : chr "2014-09-01 22:00:09" "2014-09-01 21:14:07" "2014-09-01 21:05:36" "2014-09-01 20:43:34" ...
## $ Popular : int 1 0 0 1 1 1 0 1 1 0 ...
## $ UniqueID : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "comment")= chr "glbObsTrn"
## NULL
## [1] "Reading file ./data/NYTimesBlogTest.csv..."
## [1] "dimensions of data in ./data/NYTimesBlogTest.csv: 1,870 rows x 9 cols"
## NewsDesk SectionName SubsectionName
## 1 Culture
## 2 Culture Arts
## 3 Business Crosswords/Games
## 4 Business Business Day Dealbook
## 5 Science Health
## 6 Science Health
## Headline
## 1 'Birdman' Tops the Gothams
## 2 'Sleepy Hollow' Recap: A Not-So-Shocking Death
## 3 Drinking Buddy For Falstaff
## 4 Encouraging Public Service, Through Wall Street's 'Revolving Door'
## 5 Therapy Prevents Repeat Suicide Attempts
## 6 Hoping for a Good Death
## Snippet
## 1 The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner.
## 2 In the fall season finale, a question of where the series has many places to go.
## 3 In which Timothy Polin reveals his potty mouth.
## 4 The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than on good public policy.
## 5 Short-term psychotherapy may be an effective way to prevent repeated suicide attempts.
## 6 What I hadnt considered before my fathers heart attack was the precise meaning of not wanting to live hooked up to machines.
## Abstract
## 1 The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner.
## 2 In the fall season finale, a question of where the series has many places to go.
## 3 In which Timothy Polin reveals his potty mouth.
## 4 The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than on good public policy.
## 5 Short-term psychotherapy may be an effective way to prevent repeated suicide attempts.
## 6 What I hadnt considered before my fathers heart attack was the precise meaning of not wanting to live hooked up to machines.
## WordCount PubDate UniqueID
## 1 111 2014-12-01 22:45:24 6533
## 2 558 2014-12-01 22:01:34 6534
## 3 788 2014-12-01 22:00:26 6535
## 4 915 2014-12-01 21:04:13 6536
## 5 213 2014-12-01 19:13:20 6537
## 6 938 2014-12-01 19:05:12 6538
## NewsDesk SectionName SubsectionName
## 3 Business Crosswords/Games
## 334 OpEd Opinion
## 725 TStyle
## 732 Business Business Day Dealbook
## 752 Business Business Day Dealbook
## 864
## Headline
## 3 Drinking Buddy For Falstaff
## 334 Facts & Figures: America’s Unique Take on Maternity Leave
## 725 Ansel Elgort Buttons Up in Brioni
## 732 A Shake-Up as the Financial World Infiltrates Philanthropy
## 752 Coupang, a South Korean E-Commerce Site, Raises $300 Million
## 864 Today in Politics
## Snippet
## 3 In which Timothy Polin reveals his potty mouth.
## 334 In the U.S., paid parental leave is more of a perk than a guarantee.
## 725 The actor brought a tinge of youthfulness to the classic Italian houses retro-tailored look.
## 732 Donor-advised funds help investors get deductions for charitable donations in one year, but society doesnt get the benefit of the money right away.
## 752 The latest financing round underscores Coupangs maturity and its ambitions to one day be a publicly traded company.
## 864 The 113th Congress is concluding with partisan brinksmanship and one last mad scramble for votes to pass a $1.1 trillion spending package.
## Abstract
## 3 In which Timothy Polin reveals his potty mouth.
## 334 In the U.S., paid parental leave is more of a perk than a guarantee.
## 725 The actor brought a tinge of youthfulness to the classic Italian houses retro-tailored look.
## 732 Donor-advised funds help investors get deductions for charitable donations in one year, but society doesnt get the benefit of the money right away.
## 752 The latest financing round underscores Coupangs maturity and its ambitions to one day be a publicly traded company.
## 864 The 113th Congress is concluding with partisan brinksmanship and one last mad scramble for votes to pass a $1.1 trillion spending package.
## WordCount PubDate UniqueID
## 3 788 2014-12-01 22:00:26 6535
## 334 160 2014-12-04 11:45:20 6866
## 725 89 2014-12-10 12:30:47 7257
## 732 1172 2014-12-10 12:00:38 7264
## 752 353 2014-12-10 08:30:41 7284
## 864 1544 2014-12-11 07:09:25 7396
## NewsDesk SectionName SubsectionName
## 1865
## 1866 Business Technology
## 1867 Metro N.Y. / Region
## 1868 Multimedia
## 1869 Foreign World Asia Pacific
## 1870 Science Health
## Headline
## 1865 Today in Politics
## 1866 Uber Suspends Operations in Spain
## 1867 New York Today: The Year in News
## 1868 New Year, Old Memories, in Times Square
## 1869 Hong Kong Police Criticized After 14-Year-Old's Detention
## 1870 The Super-Short Workout and Other Fitness Trends
## Snippet
## 1865 House Republicans are ending the year on a defensive note over Representative Steve Scalises 2002 speech to a white supremacist group.
## 1866 In a first in the growing pushback against Ubers global expansion, a judges ruling barred telecommunications operators and banks from supporting the companys services.
## 1867 Wednesday: The most read stories of 2014, teeth-chattering cold, and its New Years Eve.
## 1868 What happens when you combine Burning Man, Independence Day fireworks, the last day of school and a full-contact Black Friday sale-a-bration? New Years Eve in Times Square.
## 1869 The authorities have been accused of trying to intimidate young pro-democracy protesters and their families after a 14-year-old girl was detained on suspicion of drawing flowers in chalk near government headquarters and sent to a juvenile home.
## 1870 The big story in exercise science this year was the super-short workout, although many other fitness-related themes emerged in 2014.
## Abstract
## 1865 House Republicans are ending the year on a defensive note over Representative Steve Scalises 2002 speech to a white supremacist group.
## 1866 In a first in the growing pushback against Ubers global expansion, a judges ruling barred telecommunications operators and banks from supporting the companys services.
## 1867 Wednesday: The most read stories of 2014, teeth-chattering cold, and its New Years Eve.
## 1868 What happens when you combine Burning Man, Independence Day fireworks, the last day of school and a full-contact Black Friday sale-a-bration? New Years Eve in Times Square.
## 1869 The authorities have been accused of trying to intimidate young pro-democracy protesters and their families after a 14-year-old girl was detained on suspicion of drawing flowers in chalk near government headquarters and sent to a juvenile home.
## 1870 The big story in exercise science this year was the super-short workout, although many other fitness-related themes emerged in 2014.
## WordCount PubDate UniqueID
## 1865 1616 2014-12-31 07:03:46 8397
## 1866 292 2014-12-31 06:09:32 8398
## 1867 1010 2014-12-31 06:06:58 8399
## 1868 387 2014-12-31 05:00:19 8400
## 1869 717 2014-12-31 04:16:29 8401
## 1870 818 2014-12-31 00:01:10 8402
## 'data.frame': 1870 obs. of 9 variables:
## $ NewsDesk : chr "Culture" "Culture" "Business" "Business" ...
## $ SectionName : chr "" "Arts" "Crosswords/Games" "Business Day" ...
## $ SubsectionName: chr "" "" "" "Dealbook" ...
## $ Headline : chr "'Birdman' Tops the Gothams" "'Sleepy Hollow' Recap: A Not-So-Shocking Death" "Drinking Buddy For Falstaff" "Encouraging Public Service, Through Wall Street's 'Revolving Door'" ...
## $ Snippet : chr "The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner." "In the fall season finale, a question of where the series has many places to go." "In which Timothy Polin reveals his potty mouth." "The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than "| __truncated__ ...
## $ Abstract : chr "The backstage tale won two awards; Citizenfour, the Edward Snowden documentary, was also a winner." "In the fall season finale, a question of where the series has many places to go." "In which Timothy Polin reveals his potty mouth." "The debate about pay for Wall Street executives who take government jobs appears to be based more on a populist shakedown than "| __truncated__ ...
## $ WordCount : int 111 558 788 915 213 938 1336 2644 752 99 ...
## $ PubDate : chr "2014-12-01 22:45:24" "2014-12-01 22:01:34" "2014-12-01 22:00:26" "2014-12-01 21:04:13" ...
## $ UniqueID : int 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 ...
## - attr(*, "comment")= chr "glbObsNew"
## NULL
## [1] "Partition stats:"
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
## Popular .src .n
## 1 0 Train 5439
## 2 NA Test 1870
## 3 1 Train 1093
## Popular .src .n
## 1 0 Train 5439
## 2 NA Test 1870
## 3 1 Train 1093
## .src .n
## 1 Train 6532
## 2 Test 1870
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Loading required package: lazyeval
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
## [1] "Found 0 duplicates by all features:"
## NULL
## [1] "Partition stats:"
## Popular .src .n
## 1 0 Train 5439
## 2 NA Test 1870
## 3 1 Train 1093
## Popular .src .n
## 1 0 Train 5439
## 2 NA Test 1870
## 3 1 Train 1093
## .src .n
## 1 Train 6532
## 2 Test 1870
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 17.432 34.359 16.927
## 2 inspect.data 2 0 0 34.360 NA NA
2.0: inspect data
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1870 rows containing non-finite values (stat_bin).
## Loading required package: reshape2
## Popular.0 Popular.1 Popular.NA
## Test NA NA 1870
## Train 5439 1093 NA
## Popular.0 Popular.1 Popular.NA
## Test NA NA 1
## Train 0.8326699 0.1673301 NA
## [1] "numeric data missing in glbObsAll: "
## Popular
## 1870
## [1] "numeric data w/ 0s in glbObsAll: "
## WordCount Popular
## 109 5439
## [1] "numeric data w/ Infs in glbObsAll: "
## named integer(0)
## [1] "numeric data w/ NaNs in glbObsAll: "
## named integer(0)
## [1] "string data missing in glbObsAll: "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
## Popular Popular.fctr .n
## 1 0 N 5439
## 2 NA <NA> 1870
## 3 1 Y 1093
## Warning: Removed 1 rows containing missing values (position_stack).
## Popular.fctr.N Popular.fctr.Y Popular.fctr.NA
## Test NA NA 1870
## Train 5439 1093 NA
## Popular.fctr.N Popular.fctr.Y Popular.fctr.NA
## Test NA NA 1
## Train 0.8326699 0.1673301 NA
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## label step_major step_minor label_minor bgn end elapsed
## 2 inspect.data 2 0 0 34.360 37.863 3.503
## 3 scrub.data 2 1 1 37.863 NA NA
2.1: scrub data
## [1] "numeric data missing in glbObsAll: "
## Popular Popular.fctr
## 1870 1870
## [1] "numeric data w/ 0s in glbObsAll: "
## WordCount Popular
## 109 5439
## [1] "numeric data w/ Infs in glbObsAll: "
## named integer(0)
## [1] "numeric data w/ NaNs in glbObsAll: "
## named integer(0)
## [1] "string data missing in glbObsAll: "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate
## 17 0
## label step_major step_minor label_minor bgn end elapsed
## 3 scrub.data 2 1 1 37.863 38.948 1.086
## 4 transform.data 2 2 2 38.949 NA NA
2.2: transform data
## [1] "Creating new feature: NDSSName.my..."
## [1] "Creating new feature: WordCount.log1p..."
## [1] "Creating new feature: WordCount.root2..."
## [1] "Creating new feature: WordCount.nexp..."
## label step_major step_minor label_minor bgn end elapsed
## 4 transform.data 2 2 2 38.949 39.267 0.318
## 5 extract.features 3 0 0 39.267 NA NA
3.0: extract features
## label step_major step_minor label_minor bgn end
## 1 extract.features_bgn 1 0 0 39.324 NA
## elapsed
## 1 NA
## label step_major step_minor label_minor
## 1 extract.features_bgn 1 0 0
## 2 extract.features_factorize.str.vars 2 0 0
## bgn end elapsed
## 1 39.324 39.335 0.011
## 2 39.335 NA NA
## NewsDesk SectionName SubsectionName Headline
## "NewsDesk" "SectionName" "SubsectionName" "Headline"
## Snippet Abstract PubDate .src
## "Snippet" "Abstract" "PubDate" ".src"
## NDSSName.my
## "NDSSName.my"
## Warning: Creating factors of string variable: NDSSName.my: # of unique
## values: 21
## label step_major step_minor label_minor
## 2 extract.features_factorize.str.vars 2 0 0
## 3 extract.features_xtract.DateTime.vars 3 0 0
## bgn end elapsed
## 2 39.335 39.354 0.019
## 3 39.354 NA NA
## [1] "Extracting features from DateTime(s): PubDate"
## Loading required package: XML
## [1] "**********"
## [1] "Consider adding state & city holidays for glbFeatsDateTime: PubDate"
## [1] "**********"
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## [1] "Missing data for numerics:"
## PubDate.last32.log1p.ctg
## 30
## label step_major step_minor label_minor
## 3 extract.features_xtract.DateTime.vars 3 0 0
## 4 extract.features_end 4 0 0
## bgn end elapsed
## 3 39.354 45.047 5.693
## 4 45.047 NA NA
## label step_major step_minor label_minor
## 3 extract.features_xtract.DateTime.vars 3 0 0
## 2 extract.features_factorize.str.vars 2 0 0
## 1 extract.features_bgn 1 0 0
## bgn end elapsed duration
## 3 39.354 45.047 5.693 5.693
## 2 39.335 39.354 0.019 0.019
## 1 39.324 39.335 0.011 0.011
## [1] "Total Elapsed Time: 45.047 secs"
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## label step_major step_minor label_minor bgn end
## 5 extract.features 3 0 0 39.267 46.381
## 6 manage.missing.data 3 1 1 46.382 NA
## elapsed
## 5 7.114
## 6 NA
3.1: manage missing data
## [1] "numeric data missing in : "
## Popular Popular.fctr
## 1870 1870
## [1] "numeric data w/ 0s in : "
## WordCount Popular WordCount.log1p
## 109 5439 109
## WordCount.root2 WordCount.nexp PubDate.wkday.fctr
## 109 2044 378
## PubDate.wkend PubDate.hlday PubDate.day.minutes
## 7787 8160 5
## PubDate.last2.log1p PubDate.last4.log1p PubDate.last8.log1p
## 2 4 8
## PubDate.last16.log1p PubDate.last32.log1p PubDate.last2.log1p.ctg
## 16 32 42
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg PubDate.last16.log1p.ctg
## 84 168 336
## PubDate.last32.log1p.ctg
## 670
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate NDSSName.my
## 17 0 0
## [1] "numeric data missing in : "
## Popular Popular.fctr
## 1870 1870
## [1] "numeric data w/ 0s in : "
## WordCount Popular WordCount.log1p
## 109 5439 109
## WordCount.root2 WordCount.nexp PubDate.wkday.fctr
## 109 2044 378
## PubDate.wkend PubDate.hlday PubDate.day.minutes
## 7787 8160 5
## PubDate.last2.log1p PubDate.last4.log1p PubDate.last8.log1p
## 2 4 8
## PubDate.last16.log1p PubDate.last32.log1p PubDate.last2.log1p.ctg
## 16 32 42
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg PubDate.last16.log1p.ctg
## 84 168 336
## PubDate.last32.log1p.ctg
## 670
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate NDSSName.my
## 17 0 0
## label step_major step_minor label_minor bgn end
## 6 manage.missing.data 3 1 1 46.382 47.49
## 7 cluster.data 3 2 2 47.491 NA
## elapsed
## 6 1.108
## 7 NA
# 3.2: cluster data
# Computes, for every (by_var, .clusterid) group, the entropy of the
#   distribution of entropy_var together with the group's observation count.
#
# Args:
#   obs_df:      data frame of observations
#   entropy_var: name (string) of the column whose value counts are measured;
#                rows where it is NA are excluded
#   by_var:      name (string) of the grouping column; when NULL all
#                observations are placed in a single ".default" group
#
# Returns:
#   One row per group: a "<by_var>.clusterid" key column (by_var and
#   .clusterid joined by unite_), one count column per observed value of
#   entropy_var (0 when absent), plus .entropy (maximum-likelihood entropy of
#   the counts) and .knt (sum of the counts).
mycompute_entropy_df <- function(obs_df, entropy_var, by_var=NULL) {
    # library() fails immediately when a package is missing, whereas require()
    #   only returns FALSE and lets the pipeline fail later
    library(lazyeval)
    library(dplyr)
    library(tidyr)

    if (is.null(by_var)) {
        by_var <- ".default"
        obs_df$.default <- as.factor(".default")
    }

    # Exact name match: a substring test would also be satisfied by columns
    #   such as "<x>.clusterid" and skip creating the column that the
    #   pipeline below requires
    if (!(".clusterid" %in% names(obs_df)))
        obs_df$.clusterid <- 1

    # Standard-evaluation (underscore) verbs are kept because the column names
    #   arrive as strings
    cluster_df <- obs_df %>%
        count_(c(by_var, ".clusterid", entropy_var)) %>%
        dplyr::filter(n > 0) %>%
        dplyr::filter_(interp(~(!is.na(var)), var=as.name(entropy_var))) %>%
        unite_(paste0(by_var, ".clusterid"),
               c(interp(by_var), ".clusterid")) %>%
        spread_(interp(entropy_var), "n", fill=0)
    # head(cluster_df)
    # sum(cluster_df$n)

    # seq_len() yields an empty index when no group survives the filters
    #   (1:0 would iterate over c(1, 0)); vapply() guarantees numeric results.
    # Both vectors are computed before either is attached so that the count
    #   columns (everything except the first, key column) are all that is read.
    row_ixs <- seq_len(nrow(cluster_df))
    tmp.entropy <- vapply(row_ixs,
        function(row) entropy::entropy(as.numeric(cluster_df[row, -1]),
                                       method = "ML"),
        numeric(1))
    tmp.knt <- vapply(row_ixs,
        function(row) sum(as.numeric(cluster_df[row, -1])),
        numeric(1))
    cluster_df$.entropy <- tmp.entropy; cluster_df$.knt <- tmp.knt
    #print(cluster_df)
    return(cluster_df)
}
if (glb_cluster) {
require(proxy)
#require(hash)
require(dynamicTreeCut)
require(entropy)
require(tidyr)
require(ggdendro)
mywgtdcosine_dist <- function(x, y=NULL, weights=NULL) {
    # Pairwise weighted-cosine distance between the rows of x.
    #
    # Args:
    #   x:       matrix, or object coercible by as.matrix(); one row per
    #            observation
    #   y:       accepted but not used; distances are always among rows of x
    #   weights: per-column weights; defaults to 1 for every column
    #
    # Returns:
    #   nrow(x) by nrow(x) matrix of 1 - similarity, where NaN entries are
    #   replaced by 1 and the diagonal is set to 0.
    #
    # NOTE(review): the column weights enter the numerator on both sides of
    #   the cross-product but the denominator only once per row -- confirm
    #   this is the intended weighting.
    obs_mtrx <- if (inherits(x, "matrix")) x else as.matrix(x)
    n_obs <- nrow(obs_mtrx)

    if (is.null(weights))
        weights <- rep(1, ncol(obs_mtrx))
    wgt_total <- sum(weights)

    # Every row of wgt_mtrx holds the weights normalized to sum to 1
    wgt_mtrx <- matrix(rep(weights / wgt_total, n_obs), nrow = n_obs,
                       byrow = TRUE)

    scaled_mtrx <- obs_mtrx * wgt_mtrx
    row_sqsum <- rowSums((obs_mtrx ^ 2) * wgt_mtrx)

    similarity_mtrx <- wgt_total * tcrossprod(scaled_mtrx) /
        sqrt(outer(row_sqsum, row_sqsum))

    dstnc_mtrx <- 1 - similarity_mtrx
    # 0 / 0 (e.g. an all-zero row) yields NaN; treat it as maximally distant
    dstnc_mtrx[is.nan(dstnc_mtrx)] <- 1
    diag(dstnc_mtrx) <- 0
    dstnc_mtrx
}
#pr_DB$delete_entry("mywgtdcosine");
# Need to do this only once across runs ?
# Register mywgtdcosine_dist in the proxy distance registry under the name
# "mywgtdcosine" unless an entry with that name already exists.
if (!pr_DB$entry_exists("mywgtdcosine")) {
pr_DB$set_entry(FUN = mywgtdcosine_dist, names = c("mywgtdcosine"))
pr_DB$modify_entry(names="mywgtdcosine", type="metric", loop=FALSE)
}
#pr_DB$get_entry("mywgtdcosine")
# glb_hash <- hash(key=unique(glbObsAll$myCategory),
# values=1:length(unique(glbObsAll$myCategory)))
# glb_hash_lst <- hash(key=unique(glbObsAll$myCategory),
# values=1:length(unique(glbObsAll$myCategory)))
#stop(here"); glb2Sav(); glbObsAll <- savObsAll
# Clustering features: columns of glbObsAll whose names contain "<X>.P." or
# "<X>.T.", where <X> is any of the upper-cased first letters of the names in
# glbFeatsText.
cluster_vars <- grep(paste0("[",
toupper(paste0(substr(glbFeatsText, 1, 1), collapse = "")),
"]\\.[PT]\\."),
names(glbObsAll), value = TRUE)
# Assign correlations with rsp_var as weights for cosine distance
print("Clustering features: ")
# Absolute correlation of each clustering feature with the numeric response,
# computed on the Train rows only.
cluster_vars_df <- data.frame(abs.cor.y = abs(cor(
glbObsAll[glbObsAll$.src == "Train", cluster_vars],
as.numeric(glbObsAll[glbObsAll$.src == "Train", glb_rsp_var]),
use = "pairwise.complete.obs")))
# Drop features whose correlation is NA, reorder by abs.cor.y (the reordered
# frame is assigned back inside the print call) and show the last 5 rows.
print(tail(cluster_vars_df <- orderBy(~ abs.cor.y,
subset(cluster_vars_df, !is.na(abs.cor.y))), 5))
# Correlation of the .rnorm column with the response, printed for comparison.
print(sprintf(" .rnorm cor: %0.4f",
cor(glbObsAll[glbObsAll$.src == "Train", ".rnorm"],
as.numeric(glbObsAll[glbObsAll$.src == "Train", glb_rsp_var]),
use = "pairwise.complete.obs")))
# Entropy of glb_cluster_entropy_var over all observations; allobs_ent is
# assigned inside the sprintf argument and reused further down.
print(sprintf("glbObsAll Entropy: %0.4f",
allobs_ent <- entropy(table(glbObsAll[, glb_cluster_entropy_var]),
method="ML")))
# Per-category entropy table.
print(category_df <- mycompute_entropy_df(obs_df=glbObsAll,
entropy_var=glb_cluster_entropy_var,
by_var=glbFeatsCategory))
# Count-weighted mean of the per-category entropies, also shown as a
# percentage of allobs_ent. category_ent is assigned in one sprintf argument
# and read in the next, so the argument order matters.
print(sprintf("glbObsAll$%s Entropy: %0.4f (%0.4f pct)",
glbFeatsCategory,
category_ent <- weighted.mean(category_df$.entropy, category_df$.knt),
100 * category_ent / allobs_ent))
# Every observation starts in cluster 1; the per-category loop overwrites it.
glbObsAll$.clusterid <- 1
#print(max(table(glbObsAll$myCategory.fctr) / 20))
#stop(here"); glb2Sav()
grp_ids <- sort(unique(glbObsAll[, glbFeatsCategory]))
glb_cluster_size_df_lst <- list()
# One PNG holding a grid with one row per category and two columns; pltIx is
# the grid row that the next category's plots are drawn into.
png(paste0(glb_out_pfx, "FeatsTxtClusters.png"),
width = 480 * 2, height = 480 * length(grp_ids))
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow = length(grp_ids), ncol = 2)))
pltIx <- 1
# Cluster the observations of each category separately on the weighted text
# features. For every category:
#   1. build the "mywgtdcosine" distance matrix and print the most and least
#      distant pairs of observations;
#   2. run Ward hierarchical clustering and draw the dendrogram (grid column 1);
#   3. try 5 candidate minimum cluster sizes between nrow/2 and nrow/10 and
#      keep the one with the lowest count-weighted entropy of
#      glb_cluster_entropy_var (entropy curve drawn in grid column 2);
#   4. write the resulting cluster ids to glbObsAll$.clusterid.
# Reads cluster_vars, cluster_vars_df, grp_ids and pltIx set up before the
# loop; updates glbObsAll, glb_cluster_size_df_lst and pltIx.
for (grp in grp_ids) {
    # if (grep(grp, levels(grp_ids)) <= 6) next
    # if (grep(grp, levels(grp_ids)) > 9) next
    # if (grep(grp, levels(grp_ids)) != 10) next

    print(sprintf("Category: %s", grp))
    ctgry_allobs_df <- glbObsAll[glbObsAll[, glbFeatsCategory] == grp, ]

    # hclust() cannot cluster fewer than two observations. Such a category
    # keeps the default .clusterid of 1 and its grid row is left empty.
    if (nrow(ctgry_allobs_df) < 2) {
        print(sprintf("Category: %s has fewer than 2 observations; clustering skipped",
                      grp))
        pltIx <- pltIx + 1
        next
    }

    if (!inherits(ctgry_allobs_df[, glb_cluster_entropy_var], "factor"))
        ctgry_allobs_df[, glb_cluster_entropy_var] <-
            as.factor(ctgry_allobs_df[, glb_cluster_entropy_var])

    #dstns_dist <- proxy::dist(ctgry_allobs_df[, cluster_vars], method = "cosine")
    dstns_dist <- proxy::dist(ctgry_allobs_df[, row.names(cluster_vars_df)],
                              method = "mywgtdcosine",
                              weights = cluster_vars_df$abs.cor.y)
    # Custom distance functions return a crossdist object
    #dstns_mtrx <- as.matrix(dstns_dist)
    dstns_mtrx <- matrix(as.vector(dstns_dist), nrow=attr(dstns_dist, "dim")[1],
                         dimnames=attr(dstns_dist, "dimnames"))
    dstns_dist <- as.dist(dstns_mtrx)

    print(sprintf("max distance(%0.4f) pair:", max(dstns_mtrx)))
    # print(dim(dstns_mtrx))
    # print(sprintf("which.max: %d", which.max(dstns_mtrx)))
    # which.max() returns a linear index; the matrix is square, so dividing by
    # ncol locates one member of the pair and a second which.max the other.
    row_ix <- ceiling(which.max(dstns_mtrx) / ncol(dstns_mtrx))
    col_ix <- which.max(dstns_mtrx[row_ix, ])
    # print(sprintf("row_ix: %d", row_ix)); print(sprintf("col_ix: %d", col_ix));
    # print(dim(ctgry_allobs_df))
    print(ctgry_allobs_df[c(row_ix, col_ix),
        c(glb_id_var, glb_cluster_entropy_var, glbFeatsCategory, glbFeatsText, cluster_vars)])

    # Mask the zero diagonal so the minimum is taken over distinct pairs only.
    min_dstns_mtrx <- dstns_mtrx
    diag(min_dstns_mtrx) <- 1
    # Float representations issue -2.22e-16 vs. 0.0000
    print(sprintf("min distance(%0.4f) pair:", min(min_dstns_mtrx)))
    row_ix <- ceiling(which.min(min_dstns_mtrx) / ncol(min_dstns_mtrx))
    col_ix <- which.min(min_dstns_mtrx[row_ix, ])
    print(ctgry_allobs_df[c(row_ix, col_ix),
        c(glb_id_var, glb_cluster_entropy_var, glbFeatsCategory, glbFeatsText,
          cluster_vars)])

    set.seed(glb_cluster.seed)
    clusters <- hclust(dstns_dist, method = "ward.D2")
    # Workaround to avoid "Error in cutree(dendro, h = heightcutoff) : the 'height' component of 'tree' is not sorted (increasingly)"
    # all.equal() returns a character vector, not FALSE, when the heights
    # differ from their sorted order, so it must be wrapped in isTRUE() to be
    # usable as an if() condition.
    if (isTRUE(with(clusters, all.equal(height, sort(height)))))
        clusters$height <- round(clusters$height, 6)

    clusters$labels <- ctgry_allobs_df[, glb_id_var]
    clustersDD <- dendro_data(clusters)
    # Attach the response value of each leaf so the points can be coloured.
    clustersDD$labels[, glb_rsp_var] <- sapply(clustersDD$labels$label, function(id)
        ctgry_allobs_df[id == ctgry_allobs_df[, glb_id_var], glb_rsp_var])
    print(ggdendrogram(clustersDD, rotate = TRUE, size = 2) +
            geom_point(data = clustersDD$labels,
                aes_string(x = "x", color = glb_rsp_var), y = min(clustersDD$segments$y)) +
            coord_flip(ylim = c(min(clustersDD$segments$y),
                                max(clustersDD$segments$y))) +
            ggtitle(grp),
          vp = viewport(layout.pos.row = pltIx, layout.pos.col = 1))

    # clusters$labels <- ctgry_allobs_df[, glb_id_var]
    # clustersDD <- dendro_data(clusters)
    # clustersDD$labels$color <- sapply(clustersDD$labels$label, function(id)
    # ctgry_allobs_df[id == ctgry_allobs_df[, glb_id_var], glb_rsp_var])
    # print(ggdendrogram(clustersDD, rotate = TRUE, size = 2) +
    # geom_point(data = clustersDD$labels,
    # aes_string(x = "x", color = "color"), y = min(clustersDD$segments$y)) +
    # coord_flip(ylim = c(min(clustersDD$segments$y),
    # max(clustersDD$segments$y))))
    # print(ggdendrogram(clustersDD, rotate = TRUE, size = 2) +
    # geom_point(data = clustersDD$labels,
    # aes_string(x = "x", y = "y", color = "color")))
    # myplclust(clusters, lab=ctgry_allobs_df[, glb_id_var],
    # lab.col=unclass(ctgry_allobs_df[, glb_cluster_entropy_var]))

    # Baseline row: the whole category as a single cluster.
    opt_minclustersize_df <- data.frame(minclustersize = nrow(ctgry_allobs_df),
        entropy = entropy(table(ctgry_allobs_df[, glb_cluster_entropy_var]),
                          method = "ML"))
    for (minclustersize in
         as.integer(seq(nrow(ctgry_allobs_df) / 2, nrow(ctgry_allobs_df) / 10,
                        length.out = 5))) {
        clusterGroups <- cutreeDynamic(clusters, minClusterSize = minclustersize,
                                       method = "tree", deepSplit = 0)
        # Unassigned groups are labeled 0; the largest group has label 1
        clusterGroups[clusterGroups == 0] <- 1
        ctgry_allobs_df$.clusterid <- clusterGroups
        ctgry_clstrs_df <- mycompute_entropy_df(ctgry_allobs_df,
                                                glb_cluster_entropy_var)
        opt_minclustersize_df <- rbind(opt_minclustersize_df,
            data.frame(minclustersize = minclustersize,
                entropy = weighted.mean(ctgry_clstrs_df$.entropy, ctgry_clstrs_df$.knt)))
    }
    opt_minclustersize <-
        opt_minclustersize_df$minclustersize[which.min(opt_minclustersize_df$entropy)]
    # Highlight the chosen size in red on the entropy curve.
    opt_minclustersize_df$.color <-
        ifelse(opt_minclustersize_df$minclustersize == opt_minclustersize,
               "red", "blue")
    print(ggplot(data = opt_minclustersize_df,
                 mapping = aes(x = minclustersize, y = entropy)) +
            geom_point(aes(color = .color)) + scale_color_identity() +
            guides(color = "none") + geom_line(),
          vp = viewport(layout.pos.row = pltIx, layout.pos.col = 2))
    glb_cluster_size_df_lst[[grp]] <- opt_minclustersize_df

    # select minclustersize that minimizes entropy
    clusterGroups <- cutreeDynamic(clusters, minClusterSize = opt_minclustersize,
                                   method = "tree",
                                   deepSplit = 0)
    # Unassigned groups are labeled 0; the largest group has label 1
    clusterGroups[clusterGroups == 0] <- 1
    glbObsAll[glbObsAll[, glbFeatsCategory] == grp,]$.clusterid <-
        clusterGroups

    pltIx <- pltIx + 1
}
# Close the PNG device that received the per-category cluster plots.
dev.off()
#all.equal(savObsAll_clusterid, glbObsAll$.clusterid)
# Entropy table recomputed now that .clusterid has been assigned.
print(cluster_df <- mycompute_entropy_df(obs_df=glbObsAll,
entropy_var=glb_cluster_entropy_var,
by_var=glbFeatsCategory))
# Count-weighted mean entropy after clustering, also shown as a percentage of
# the category-only entropy (category_ent). cluster_ent is assigned in one
# sprintf argument and read in the next, so the argument order matters.
print(sprintf("glbObsAll$%s$.clusterid Entropy: %0.4f (%0.4f pct)",
glbFeatsCategory,
cluster_ent <- weighted.mean(cluster_df$.entropy, cluster_df$.knt),
100 * cluster_ent / category_ent))
glbObsAll$.clusterid.fctr <- as.factor(glbObsAll$.clusterid)
# .clusterid.fctr is created automatically (probably ?) later
# The raw numeric id goes on the exclusion list; the factor version is
# registered below under glbFeatsInteractionOnly.
glbFeatsExclude <- c(glbFeatsExclude, ".clusterid")
# The commented-out lines are an earlier variant; the statement governed by
# this if is the glbFeatsInteractionOnly assignment that follows them. It maps
# ".clusterid.fctr" to the category feature name, with ".fctr" appended when
# the name does not already contain it.
if (!is.null(glbFeatsCategory))
# glbFeatsInteractionOnly[ifelse(grepl("\\.fctr", glbFeatsCategory),
# glbFeatsCategory,
# paste0(glbFeatsCategory, ".fctr"))] <-
# c(".clusterid.fctr")
glbFeatsInteractionOnly[[".clusterid.fctr"]] <-
ifelse(grepl("\\.fctr", glbFeatsCategory), glbFeatsCategory,
paste0(glbFeatsCategory, ".fctr"))
# Optionally add the text features used for clustering to the exclusion list.
if (glbFeatsTextClusterVarsExclude)
glbFeatsExclude <- c(glbFeatsExclude, cluster_vars)
}
# Last call for data modifications
#stop(here") # savObsAll <- glbObsAll
# glbObsAll[(glbObsAll$PropR == 0.75) & (glbObsAll$State == "Hawaii"), "PropR.fctr"] <- "N"

# Re-partition: rebuild the training and new-observation frames from the
# combined frame, keyed on the .src column. which() discards rows whose .src
# is NA, as subset() does.
glbObsTrn <- glbObsAll[which(glbObsAll[[".src"]] == "Train"), , drop = FALSE]
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
glbObsNew <- glbObsAll[which(glbObsAll[[".src"]] == "Test"), , drop = FALSE]
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique

# Record the start of the next major chunk in the chunk log.
glb_chunks_df <- myadd_chunk(
    glb_chunks_df,
    "partition.data.training",
    major.inc = TRUE
)
## label step_major step_minor label_minor bgn end
## 7 cluster.data 3 2 2 47.491 47.817
## 8 partition.data.training 4 0 0 47.818 NA
## elapsed
## 7 0.326
## 8 NA
4.0: partition data training
## [1] "Prediction Hints by Catgeory:"
## NDSSName.my.fctr Popular.0 Popular.1 .n.tst .strata.0 .strata.1
## 5 #U.S.#Education 325 NA 89 82 17
## 10 Culture## 1 NA 70 1 13
## 12 Foreign#World# 172 NA 47 44 9
## 21 myOther 38 NA 5 5 1
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Loading required package: sampling
##
## Attaching package: 'sampling'
##
## The following objects are masked from 'package:survival':
##
## cluster, strata
##
## The following object is masked from 'package:caret':
##
## cluster
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Popular.0 Popular.1 Popular.NA
## NA NA 1870
## Fit 3941 863 NA
## OOB 1498 230 NA
## Popular.0 Popular.1 Popular.NA
## NA NA 1
## Fit 0.8203580 0.1796420 NA
## OOB 0.8668981 0.1331019 NA
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## NDSSName.my.fctr .n.Fit .n.OOB .n.Tst .freqRatio.Fit
## 1 ## 913 371 342 0.190049958
## 6 Business#BusinessDay#Dealbook 629 323 304 0.130932556
## 11 Culture#Arts# 490 185 174 0.101998335
## 15 OpEd#Opinion# 437 89 164 0.090965862
## 9 Business#Technology# 213 126 114 0.044338052
## 19 TStyle## 623 101 105 0.129683597
## 5 #U.S.#Education 243 82 89 0.050582848
## 10 Culture## NA 1 70 NA
## 14 Metro#N.Y./Region# 128 70 67 0.026644463
## 18 Styles#U.S.# 127 50 61 0.026436303
## 16 Science#Health# 148 48 57 0.030807660
## 13 Foreign#World#AsiaPacific 150 53 56 0.031223980
## 2 #Multimedia# 92 49 52 0.019150708
## 12 Foreign#World# 128 44 47 0.026644463
## 8 Business#Crosswords/Games# 105 18 42 0.021856786
## 7 Business#BusinessDay#SmallBusiness 100 40 41 0.020815987
## 20 Travel#Travel# 83 34 35 0.017277269
## 3 #Opinion#RoomForDebate 42 20 20 0.008742714
## 17 Styles##Fashion 104 15 15 0.021648626
## 4 #Opinion#ThePublicEditor 16 4 10 0.003330558
## 21 myOther 33 5 5 0.006869276
## .freqRatio.OOB .freqRatio.Tst
## 1 0.2146990741 0.182887701
## 6 0.1869212963 0.162566845
## 11 0.1070601852 0.093048128
## 15 0.0515046296 0.087700535
## 9 0.0729166667 0.060962567
## 19 0.0584490741 0.056149733
## 5 0.0474537037 0.047593583
## 10 0.0005787037 0.037433155
## 14 0.0405092593 0.035828877
## 18 0.0289351852 0.032620321
## 16 0.0277777778 0.030481283
## 13 0.0306712963 0.029946524
## 2 0.0283564815 0.027807487
## 12 0.0254629630 0.025133690
## 8 0.0104166667 0.022459893
## 7 0.0231481481 0.021925134
## 20 0.0196759259 0.018716578
## 3 0.0115740741 0.010695187
## 17 0.0086805556 0.008021390
## 4 0.0023148148 0.005347594
## 21 0.0028935185 0.002673797
## [1] "glbObsAll: "
## [1] 8402 53
## [1] "glbObsTrn: "
## [1] 6532 53
## [1] "glbObsFit: "
## [1] 4804 52
## [1] "glbObsOOB: "
## [1] 1728 52
## [1] "glbObsNew: "
## [1] 1870 52
## Warning in rm(split): object 'split' not found
## label step_major step_minor label_minor bgn end
## 8 partition.data.training 4 0 0 47.818 49.295
## 9 select.features 5 0 0 49.295 NA
## elapsed
## 8 1.477
## 9 NA
5.0: select features
## Warning in cor(data.matrix(entity_df[, sel_feats]), y =
## as.numeric(entity_df[, : the standard deviation is zero
## id cor.y
## Popular Popular 1.000000000
## WordCount.root2 WordCount.root2 0.292120679
## WordCount WordCount 0.257526549
## WordCount.log1p WordCount.log1p 0.254319628
## NDSSName.my.fctr NDSSName.my.fctr 0.165445970
## PubDate.day.minutes PubDate.day.minutes 0.156753478
## PubDate.day.minutes.poly.1 PubDate.day.minutes.poly.1 0.156753478
## PubDate.hour.fctr PubDate.hour.fctr 0.135436805
## PubDate.wkend PubDate.wkend 0.104707290
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.2 0.070977720
## PubDate.last4.log1p PubDate.last4.log1p 0.066473282
## PubDate.last2.log1p PubDate.last2.log1p 0.063068716
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5 -0.055929231
## PubDate.last8.log1p PubDate.last8.log1p 0.054458821
## WordCount.nexp WordCount.nexp -0.053208396
## PubDate.last16.log1p PubDate.last16.log1p 0.040735543
## PubDate.wkday.fctr PubDate.wkday.fctr -0.039801288
## PubDate.minute.fctr PubDate.minute.fctr -0.034073846
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3 0.027983551
## PubDate.zoo.ctg PubDate.zoo.ctg 0.022782795
## PubDate.month.fctr PubDate.month.fctr 0.019148739
## PubDate.POSIX PubDate.POSIX 0.015683258
## PubDate.last32.log1p.ctg PubDate.last32.log1p.ctg 0.015395971
## PubDate.day.minutes.poly.3.ctg PubDate.day.minutes.poly.3.ctg 0.014982807
## PubDate.hlday PubDate.hlday 0.014690122
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## PubDate.juliandate PubDate.juliandate 0.014361075
## PubDate.zoo PubDate.zoo 0.013260902
## PubDate.second.fctr PubDate.second.fctr -0.011879458
## UniqueID UniqueID 0.011824920
## PubDate.date.fctr PubDate.date.fctr -0.011647558
## .rnorm .rnorm 0.008212201
## PubDate.last16.log1p.ctg PubDate.last16.log1p.ctg 0.007783530
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last4.log1p.ctg PubDate.last4.log1p.ctg 0.004792781
## PubDate.last8.log1p.ctg PubDate.last8.log1p.ctg 0.003914960
## PubDate.day.minutes.poly.2.ctg PubDate.day.minutes.poly.2.ctg 0.003596414
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## PubDate.year.fctr PubDate.year.fctr NA
## exclude.as.feat cor.y.abs
## Popular 1 1.000000000
## WordCount.root2 0 0.292120679
## WordCount 1 0.257526549
## WordCount.log1p 0 0.254319628
## NDSSName.my.fctr 0 0.165445970
## PubDate.day.minutes 1 0.156753478
## PubDate.day.minutes.poly.1 0 0.156753478
## PubDate.hour.fctr 0 0.135436805
## PubDate.wkend 0 0.104707290
## PubDate.day.minutes.poly.4 0 0.073941394
## PubDate.day.minutes.poly.2 0 0.070977720
## PubDate.last4.log1p 0 0.066473282
## PubDate.last2.log1p 0 0.063068716
## PubDate.day.minutes.poly.5 0 0.055929231
## PubDate.last8.log1p 0 0.054458821
## WordCount.nexp 0 0.053208396
## PubDate.last16.log1p 0 0.040735543
## PubDate.wkday.fctr 0 0.039801288
## PubDate.minute.fctr 0 0.034073846
## PubDate.day.minutes.poly.3 0 0.027983551
## PubDate.zoo.ctg 1 0.022782795
## PubDate.month.fctr 0 0.019148739
## PubDate.POSIX 1 0.015683258
## PubDate.last32.log1p.ctg 0 0.015395971
## PubDate.day.minutes.poly.3.ctg 0 0.014982807
## PubDate.hlday 0 0.014690122
## PubDate.day.minutes.poly.4.ctg 0 0.014601521
## PubDate.day.minutes.poly.5.ctg 0 0.014574775
## PubDate.juliandate 0 0.014361075
## PubDate.zoo 1 0.013260902
## PubDate.second.fctr 0 0.011879458
## UniqueID 1 0.011824920
## PubDate.date.fctr 0 0.011647558
## .rnorm 0 0.008212201
## PubDate.last16.log1p.ctg 0 0.007783530
## PubDate.last2.log1p.ctg 0 0.006916600
## PubDate.last4.log1p.ctg 0 0.004792781
## PubDate.last8.log1p.ctg 0 0.003914960
## PubDate.day.minutes.poly.2.ctg 0 0.003596414
## PubDate.last32.log1p 0 0.003558081
## PubDate.day.minutes.poly.1.ctg 0 0.002432289
## PubDate.year.fctr 0 NA
## [1] "cor(PubDate.juliandate, PubDate.month.fctr)=0.9393"
## [1] "cor(Popular.fctr, PubDate.juliandate)=0.0144"
## [1] "cor(Popular.fctr, PubDate.month.fctr)=0.0191"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified PubDate.juliandate as highly correlated with
## PubDate.month.fctr
## [1] "cor(PubDate.day.minutes.poly.1, PubDate.hour.fctr)=0.9026"
## [1] "cor(Popular.fctr, PubDate.day.minutes.poly.1)=0.1568"
## [1] "cor(Popular.fctr, PubDate.hour.fctr)=0.1354"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified PubDate.hour.fctr as highly correlated with
## PubDate.day.minutes.poly.1
## [1] "cor(WordCount.log1p, WordCount.root2)=0.8906"
## [1] "cor(Popular.fctr, WordCount.log1p)=0.2543"
## [1] "cor(Popular.fctr, WordCount.root2)=0.2921"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified WordCount.log1p as highly correlated with
## WordCount.root2
## [1] "cor(PubDate.last4.log1p, PubDate.last8.log1p)=0.8253"
## [1] "cor(Popular.fctr, PubDate.last4.log1p)=0.0665"
## [1] "cor(Popular.fctr, PubDate.last8.log1p)=0.0545"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified PubDate.last8.log1p as highly correlated with
## PubDate.last4.log1p
## [1] "cor(PubDate.last2.log1p, PubDate.last4.log1p)=0.7598"
## [1] "cor(Popular.fctr, PubDate.last2.log1p)=0.0631"
## [1] "cor(Popular.fctr, PubDate.last4.log1p)=0.0665"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified PubDate.last2.log1p as highly correlated with
## PubDate.last4.log1p
## id cor.y
## Popular Popular 1.000000000
## WordCount.root2 WordCount.root2 0.292120679
## WordCount WordCount 0.257526549
## WordCount.log1p WordCount.log1p 0.254319628
## NDSSName.my.fctr NDSSName.my.fctr 0.165445970
## PubDate.day.minutes PubDate.day.minutes 0.156753478
## PubDate.day.minutes.poly.1 PubDate.day.minutes.poly.1 0.156753478
## PubDate.hour.fctr PubDate.hour.fctr 0.135436805
## PubDate.wkend PubDate.wkend 0.104707290
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.2 0.070977720
## PubDate.last4.log1p PubDate.last4.log1p 0.066473282
## PubDate.last2.log1p PubDate.last2.log1p 0.063068716
## PubDate.last8.log1p PubDate.last8.log1p 0.054458821
## PubDate.last16.log1p PubDate.last16.log1p 0.040735543
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3 0.027983551
## PubDate.zoo.ctg PubDate.zoo.ctg 0.022782795
## PubDate.month.fctr PubDate.month.fctr 0.019148739
## PubDate.POSIX PubDate.POSIX 0.015683258
## PubDate.last32.log1p.ctg PubDate.last32.log1p.ctg 0.015395971
## PubDate.day.minutes.poly.3.ctg PubDate.day.minutes.poly.3.ctg 0.014982807
## PubDate.hlday PubDate.hlday 0.014690122
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## PubDate.juliandate PubDate.juliandate 0.014361075
## PubDate.zoo PubDate.zoo 0.013260902
## UniqueID UniqueID 0.011824920
## .rnorm .rnorm 0.008212201
## PubDate.last16.log1p.ctg PubDate.last16.log1p.ctg 0.007783530
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last4.log1p.ctg PubDate.last4.log1p.ctg 0.004792781
## PubDate.last8.log1p.ctg PubDate.last8.log1p.ctg 0.003914960
## PubDate.day.minutes.poly.2.ctg PubDate.day.minutes.poly.2.ctg 0.003596414
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## PubDate.date.fctr PubDate.date.fctr -0.011647558
## PubDate.second.fctr PubDate.second.fctr -0.011879458
## PubDate.minute.fctr PubDate.minute.fctr -0.034073846
## PubDate.wkday.fctr PubDate.wkday.fctr -0.039801288
## WordCount.nexp WordCount.nexp -0.053208396
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5 -0.055929231
## PubDate.year.fctr PubDate.year.fctr NA
## exclude.as.feat cor.y.abs
## Popular 1 1.000000000
## WordCount.root2 0 0.292120679
## WordCount 1 0.257526549
## WordCount.log1p 0 0.254319628
## NDSSName.my.fctr 0 0.165445970
## PubDate.day.minutes 1 0.156753478
## PubDate.day.minutes.poly.1 0 0.156753478
## PubDate.hour.fctr 0 0.135436805
## PubDate.wkend 0 0.104707290
## PubDate.day.minutes.poly.4 0 0.073941394
## PubDate.day.minutes.poly.2 0 0.070977720
## PubDate.last4.log1p 0 0.066473282
## PubDate.last2.log1p 0 0.063068716
## PubDate.last8.log1p 0 0.054458821
## PubDate.last16.log1p 0 0.040735543
## PubDate.day.minutes.poly.3 0 0.027983551
## PubDate.zoo.ctg 1 0.022782795
## PubDate.month.fctr 0 0.019148739
## PubDate.POSIX 1 0.015683258
## PubDate.last32.log1p.ctg 0 0.015395971
## PubDate.day.minutes.poly.3.ctg 0 0.014982807
## PubDate.hlday 0 0.014690122
## PubDate.day.minutes.poly.4.ctg 0 0.014601521
## PubDate.day.minutes.poly.5.ctg 0 0.014574775
## PubDate.juliandate 0 0.014361075
## PubDate.zoo 1 0.013260902
## UniqueID 1 0.011824920
## .rnorm 0 0.008212201
## PubDate.last16.log1p.ctg 0 0.007783530
## PubDate.last2.log1p.ctg 0 0.006916600
## PubDate.last4.log1p.ctg 0 0.004792781
## PubDate.last8.log1p.ctg 0 0.003914960
## PubDate.day.minutes.poly.2.ctg 0 0.003596414
## PubDate.last32.log1p 0 0.003558081
## PubDate.day.minutes.poly.1.ctg 0 0.002432289
## PubDate.date.fctr 0 0.011647558
## PubDate.second.fctr 0 0.011879458
## PubDate.minute.fctr 0 0.034073846
## PubDate.wkday.fctr 0 0.039801288
## WordCount.nexp 0 0.053208396
## PubDate.day.minutes.poly.5 0 0.055929231
## PubDate.year.fctr 0 NA
## cor.high.X freqRatio
## Popular <NA> 4.976212
## WordCount.root2 <NA> 2.315789
## WordCount <NA> 2.315789
## WordCount.log1p WordCount.root2 2.315789
## NDSSName.my.fctr <NA> 1.348739
## PubDate.day.minutes <NA> 1.225490
## PubDate.day.minutes.poly.1 <NA> 1.225490
## PubDate.hour.fctr PubDate.day.minutes.poly.1 1.835040
## PubDate.wkend <NA> 12.011952
## PubDate.day.minutes.poly.4 <NA> 1.225490
## PubDate.day.minutes.poly.2 <NA> 1.225490
## PubDate.last4.log1p <NA> 1.125000
## PubDate.last2.log1p PubDate.last4.log1p 1.375000
## PubDate.last8.log1p PubDate.last4.log1p 1.142857
## PubDate.last16.log1p <NA> 3.200000
## PubDate.day.minutes.poly.3 <NA> 1.225490
## PubDate.zoo.ctg <NA> 1.000000
## PubDate.month.fctr <NA> 1.017514
## PubDate.POSIX <NA> 1.000000
## PubDate.last32.log1p.ctg <NA> 239.000000
## PubDate.day.minutes.poly.3.ctg <NA> 1.083333
## PubDate.hlday <NA> 28.160714
## PubDate.day.minutes.poly.4.ctg <NA> 1.083333
## PubDate.day.minutes.poly.5.ctg <NA> 1.083333
## PubDate.juliandate PubDate.month.fctr 1.032520
## PubDate.zoo <NA> 1.000000
## UniqueID <NA> 1.000000
## .rnorm <NA> 1.000000
## PubDate.last16.log1p.ctg <NA> 60.000000
## PubDate.last2.log1p.ctg <NA> 5.000000
## PubDate.last4.log1p.ctg <NA> 20.000000
## PubDate.last8.log1p.ctg <NA> 40.000000
## PubDate.day.minutes.poly.2.ctg <NA> 1.083333
## PubDate.last32.log1p <NA> 8.000000
## PubDate.day.minutes.poly.1.ctg <NA> 1.083333
## PubDate.date.fctr <NA> 1.021394
## PubDate.second.fctr <NA> 1.018204
## PubDate.minute.fctr <NA> 1.483365
## PubDate.wkday.fctr <NA> 1.003268
## WordCount.nexp <NA> 17.761364
## PubDate.day.minutes.poly.5 <NA> 1.225490
## PubDate.year.fctr <NA> 0.000000
## percentUnique zeroVar nzv
## Popular 0.03061849 FALSE FALSE
## WordCount.root2 24.15799143 FALSE FALSE
## WordCount 24.15799143 FALSE FALSE
## WordCount.log1p 24.15799143 FALSE FALSE
## NDSSName.my.fctr 0.32149418 FALSE FALSE
## PubDate.day.minutes 18.08022045 FALSE FALSE
## PubDate.day.minutes.poly.1 18.08022045 FALSE FALSE
## PubDate.hour.fctr 0.04592774 FALSE FALSE
## PubDate.wkend 0.03061849 FALSE FALSE
## PubDate.day.minutes.poly.4 18.08022045 FALSE FALSE
## PubDate.day.minutes.poly.2 18.08022045 FALSE FALSE
## PubDate.last4.log1p 64.98775260 FALSE FALSE
## PubDate.last2.log1p 51.17881200 FALSE FALSE
## PubDate.last8.log1p 75.12247397 FALSE FALSE
## PubDate.last16.log1p 84.44580527 FALSE FALSE
## PubDate.day.minutes.poly.3 18.08022045 FALSE FALSE
## PubDate.zoo.ctg 99.92345377 FALSE FALSE
## PubDate.month.fctr 0.04592774 FALSE FALSE
## PubDate.POSIX 99.86221678 FALSE FALSE
## PubDate.last32.log1p.ctg 92.11573791 FALSE FALSE
## PubDate.day.minutes.poly.3.ctg 53.96509492 FALSE FALSE
## PubDate.hlday 0.03061849 FALSE TRUE
## PubDate.day.minutes.poly.4.ctg 53.94978567 FALSE FALSE
## PubDate.day.minutes.poly.5.ctg 53.94978567 FALSE FALSE
## PubDate.juliandate 1.39314146 FALSE FALSE
## PubDate.zoo 99.86221678 FALSE FALSE
## UniqueID 100.00000000 FALSE FALSE
## .rnorm 100.00000000 FALSE FALSE
## PubDate.last16.log1p.ctg 95.17758726 FALSE FALSE
## PubDate.last2.log1p.ctg 92.19228414 FALSE FALSE
## PubDate.last4.log1p.ctg 95.88181261 FALSE FALSE
## PubDate.last8.log1p.ctg 96.41763625 FALSE FALSE
## PubDate.day.minutes.poly.2.ctg 53.94978567 FALSE FALSE
## PubDate.last32.log1p 90.99816289 FALSE FALSE
## PubDate.day.minutes.poly.1.ctg 53.96509492 FALSE FALSE
## PubDate.date.fctr 0.07654623 FALSE FALSE
## PubDate.second.fctr 0.06123699 FALSE FALSE
## PubDate.minute.fctr 0.06123699 FALSE FALSE
## PubDate.wkday.fctr 0.10716473 FALSE FALSE
## WordCount.nexp 11.32884262 FALSE FALSE
## PubDate.day.minutes.poly.5 18.08022045 FALSE FALSE
## PubDate.year.fctr 0.01530925 TRUE TRUE
## is.cor.y.abs.low
## Popular FALSE
## WordCount.root2 FALSE
## WordCount FALSE
## WordCount.log1p FALSE
## NDSSName.my.fctr FALSE
## PubDate.day.minutes FALSE
## PubDate.day.minutes.poly.1 FALSE
## PubDate.hour.fctr FALSE
## PubDate.wkend FALSE
## PubDate.day.minutes.poly.4 FALSE
## PubDate.day.minutes.poly.2 FALSE
## PubDate.last4.log1p FALSE
## PubDate.last2.log1p FALSE
## PubDate.last8.log1p FALSE
## PubDate.last16.log1p FALSE
## PubDate.day.minutes.poly.3 FALSE
## PubDate.zoo.ctg FALSE
## PubDate.month.fctr FALSE
## PubDate.POSIX FALSE
## PubDate.last32.log1p.ctg FALSE
## PubDate.day.minutes.poly.3.ctg FALSE
## PubDate.hlday FALSE
## PubDate.day.minutes.poly.4.ctg FALSE
## PubDate.day.minutes.poly.5.ctg FALSE
## PubDate.juliandate FALSE
## PubDate.zoo FALSE
## UniqueID FALSE
## .rnorm FALSE
## PubDate.last16.log1p.ctg TRUE
## PubDate.last2.log1p.ctg TRUE
## PubDate.last4.log1p.ctg TRUE
## PubDate.last8.log1p.ctg TRUE
## PubDate.day.minutes.poly.2.ctg TRUE
## PubDate.last32.log1p TRUE
## PubDate.day.minutes.poly.1.ctg TRUE
## PubDate.date.fctr FALSE
## PubDate.second.fctr FALSE
## PubDate.minute.fctr FALSE
## PubDate.wkday.fctr FALSE
## WordCount.nexp FALSE
## PubDate.day.minutes.poly.5 FALSE
## PubDate.year.fctr NA
## Warning in myplot_scatter(plt_feats_df, "percentUnique", "freqRatio",
## colorcol_name = "nzv", : converting nzv to class:factor
## Warning: Removed 20 rows containing missing values (geom_point).
## Warning: Removed 20 rows containing missing values (geom_point).
## Warning: Removed 20 rows containing missing values (geom_point).
## id cor.y exclude.as.feat cor.y.abs
## PubDate.hlday PubDate.hlday 0.01469012 0 0.01469012
## PubDate.year.fctr PubDate.year.fctr NA 0 NA
## cor.high.X freqRatio percentUnique zeroVar nzv
## PubDate.hlday <NA> 28.16071 0.03061849 FALSE TRUE
## PubDate.year.fctr <NA> 0.00000 0.01530925 TRUE TRUE
## is.cor.y.abs.low
## PubDate.hlday FALSE
## PubDate.year.fctr NA
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## +(rfe) fit Fold1.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep1 size: 60
## +(rfe) imp Fold1.Rep1
## -(rfe) imp Fold1.Rep1
## +(rfe) fit Fold1.Rep1 size: 32
## -(rfe) fit Fold1.Rep1 size: 32
## +(rfe) fit Fold1.Rep1 size: 16
## -(rfe) fit Fold1.Rep1 size: 16
## +(rfe) fit Fold1.Rep1 size: 8
## -(rfe) fit Fold1.Rep1 size: 8
## +(rfe) fit Fold1.Rep1 size: 4
## -(rfe) fit Fold1.Rep1 size: 4
## +(rfe) fit Fold1.Rep1 size: 2
## -(rfe) fit Fold1.Rep1 size: 2
## +(rfe) fit Fold2.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep1 size: 60
## +(rfe) imp Fold2.Rep1
## -(rfe) imp Fold2.Rep1
## +(rfe) fit Fold2.Rep1 size: 32
## -(rfe) fit Fold2.Rep1 size: 32
## +(rfe) fit Fold2.Rep1 size: 16
## -(rfe) fit Fold2.Rep1 size: 16
## +(rfe) fit Fold2.Rep1 size: 8
## -(rfe) fit Fold2.Rep1 size: 8
## +(rfe) fit Fold2.Rep1 size: 4
## -(rfe) fit Fold2.Rep1 size: 4
## +(rfe) fit Fold2.Rep1 size: 2
## -(rfe) fit Fold2.Rep1 size: 2
## +(rfe) fit Fold3.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep1 size: 60
## +(rfe) imp Fold3.Rep1
## -(rfe) imp Fold3.Rep1
## +(rfe) fit Fold3.Rep1 size: 32
## -(rfe) fit Fold3.Rep1 size: 32
## +(rfe) fit Fold3.Rep1 size: 16
## -(rfe) fit Fold3.Rep1 size: 16
## +(rfe) fit Fold3.Rep1 size: 8
## -(rfe) fit Fold3.Rep1 size: 8
## +(rfe) fit Fold3.Rep1 size: 4
## -(rfe) fit Fold3.Rep1 size: 4
## +(rfe) fit Fold3.Rep1 size: 2
## -(rfe) fit Fold3.Rep1 size: 2
## +(rfe) fit Fold1.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep2 size: 60
## +(rfe) imp Fold1.Rep2
## -(rfe) imp Fold1.Rep2
## +(rfe) fit Fold1.Rep2 size: 32
## -(rfe) fit Fold1.Rep2 size: 32
## +(rfe) fit Fold1.Rep2 size: 16
## -(rfe) fit Fold1.Rep2 size: 16
## +(rfe) fit Fold1.Rep2 size: 8
## -(rfe) fit Fold1.Rep2 size: 8
## +(rfe) fit Fold1.Rep2 size: 4
## -(rfe) fit Fold1.Rep2 size: 4
## +(rfe) fit Fold1.Rep2 size: 2
## -(rfe) fit Fold1.Rep2 size: 2
## +(rfe) fit Fold2.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep2 size: 60
## +(rfe) imp Fold2.Rep2
## -(rfe) imp Fold2.Rep2
## +(rfe) fit Fold2.Rep2 size: 32
## -(rfe) fit Fold2.Rep2 size: 32
## +(rfe) fit Fold2.Rep2 size: 16
## -(rfe) fit Fold2.Rep2 size: 16
## +(rfe) fit Fold2.Rep2 size: 8
## -(rfe) fit Fold2.Rep2 size: 8
## +(rfe) fit Fold2.Rep2 size: 4
## -(rfe) fit Fold2.Rep2 size: 4
## +(rfe) fit Fold2.Rep2 size: 2
## -(rfe) fit Fold2.Rep2 size: 2
## +(rfe) fit Fold3.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep2 size: 60
## +(rfe) imp Fold3.Rep2
## -(rfe) imp Fold3.Rep2
## +(rfe) fit Fold3.Rep2 size: 32
## -(rfe) fit Fold3.Rep2 size: 32
## +(rfe) fit Fold3.Rep2 size: 16
## -(rfe) fit Fold3.Rep2 size: 16
## +(rfe) fit Fold3.Rep2 size: 8
## -(rfe) fit Fold3.Rep2 size: 8
## +(rfe) fit Fold3.Rep2 size: 4
## -(rfe) fit Fold3.Rep2 size: 4
## +(rfe) fit Fold3.Rep2 size: 2
## -(rfe) fit Fold3.Rep2 size: 2
## +(rfe) fit Fold1.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep3 size: 60
## +(rfe) imp Fold1.Rep3
## -(rfe) imp Fold1.Rep3
## +(rfe) fit Fold1.Rep3 size: 32
## -(rfe) fit Fold1.Rep3 size: 32
## +(rfe) fit Fold1.Rep3 size: 16
## -(rfe) fit Fold1.Rep3 size: 16
## +(rfe) fit Fold1.Rep3 size: 8
## -(rfe) fit Fold1.Rep3 size: 8
## +(rfe) fit Fold1.Rep3 size: 4
## -(rfe) fit Fold1.Rep3 size: 4
## +(rfe) fit Fold1.Rep3 size: 2
## -(rfe) fit Fold1.Rep3 size: 2
## +(rfe) fit Fold2.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep3 size: 60
## +(rfe) imp Fold2.Rep3
## -(rfe) imp Fold2.Rep3
## +(rfe) fit Fold2.Rep3 size: 32
## -(rfe) fit Fold2.Rep3 size: 32
## +(rfe) fit Fold2.Rep3 size: 16
## -(rfe) fit Fold2.Rep3 size: 16
## +(rfe) fit Fold2.Rep3 size: 8
## -(rfe) fit Fold2.Rep3 size: 8
## +(rfe) fit Fold2.Rep3 size: 4
## -(rfe) fit Fold2.Rep3 size: 4
## +(rfe) fit Fold2.Rep3 size: 2
## -(rfe) fit Fold2.Rep3 size: 2
## +(rfe) fit Fold3.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep3 size: 60
## +(rfe) imp Fold3.Rep3
## -(rfe) imp Fold3.Rep3
## +(rfe) fit Fold3.Rep3 size: 32
## -(rfe) fit Fold3.Rep3 size: 32
## +(rfe) fit Fold3.Rep3 size: 16
## -(rfe) fit Fold3.Rep3 size: 16
## +(rfe) fit Fold3.Rep3 size: 8
## -(rfe) fit Fold3.Rep3 size: 8
## +(rfe) fit Fold3.Rep3 size: 4
## -(rfe) fit Fold3.Rep3 size: 4
## +(rfe) fit Fold3.Rep3 size: 2
## -(rfe) fit Fold3.Rep3 size: 2
## Warning in lda.default(x, grouping, ...): variables are collinear
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (3 fold, repeated 3 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 2 0.8096 0.03708 0.005112 0.01788
## 4 0.8864 0.52518 0.005536 0.02625
## 8 0.8945 0.57850 0.013874 0.06859
## 16 0.9304 0.75912 0.004608 0.01694
## 32 0.9305 0.75960 0.004552 0.01678
## 60 0.9326 0.76882 0.004814 0.01705 *
##
## The top 5 variables (out of 60):
## WordCount.log1p, WordCount.root2, WordCount.nexp, NDSSName.my.fctrOpEd#Opinion#, PubDate.day.minutes.poly.1
##
## [1] "WordCount.log1p"
## [2] "WordCount.root2"
## [3] "WordCount.nexp"
## [4] "NDSSName.my.fctrOpEd#Opinion#"
## [5] "PubDate.day.minutes.poly.1"
## [6] "PubDate.day.minutes.poly.4"
## [7] "PubDate.hour.fctr(15.3,23]"
## [8] "NDSSName.my.fctrScience#Health#"
## [9] "PubDate.last4.log1p"
## [10] "PubDate.last2.log1p"
## [11] "NDSSName.my.fctrBusiness#Crosswords/Games#"
## [12] "NDSSName.my.fctrStyles#U.S.#"
## [13] "PubDate.last8.log1p"
## [14] "PubDate.day.minutes.poly.5"
## [15] "PubDate.wkend"
## [16] "NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg"
## [17] "PubDate.last16.log1p"
## [18] "PubDate.juliandate"
## [19] "PubDate.month.fctr11"
## [20] "PubDate.day.minutes.poly.3"
## [21] "PubDate.wkday.fctr6"
## [22] "PubDate.date.fctr(7,13]"
## [23] "PubDate.second.fctr(14.8,29.5]"
## [24] "PubDate.wkday.fctr1"
## [25] "PubDate.month.fctr10"
## [26] ".rnorm"
## [27] "PubDate.last32.log1p"
## [28] "NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg"
## [29] "PubDate.minute.fctr(44.2,59.1]"
## [30] "NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg"
## [31] "NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg"
## [32] "PubDate.day.minutes.poly.2"
## [33] "PubDate.hour.fctr(7.67,15.3]"
## [34] "PubDate.date.fctr(25,31]"
## [35] "PubDate.minute.fctr(14.8,29.5]"
## [36] "PubDate.second.fctr(44.2,59.1]"
## [37] "PubDate.wkday.fctr3"
## [38] "NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg"
## [39] "NDSSName.my.fctrmyOther"
## [40] "NDSSName.my.fctr#Opinion#RoomForDebate"
## [41] "PubDate.date.fctr(19,25]"
## [42] "NDSSName.my.fctrBusiness#Technology#"
## [43] "PubDate.wkday.fctr4"
## [44] "PubDate.second.fctr(29.5,44.2]"
## [45] "PubDate.date.fctr(13,19]"
## [46] "NDSSName.my.fctrMetro#N.Y./Region#"
## [47] "NDSSName.my.fctrTravel#Travel#"
## [48] "NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness"
## [49] "NDSSName.my.fctr#Multimedia#"
## [50] "PubDate.wkday.fctr2"
## [51] "NDSSName.my.fctrStyles##Fashion"
## [52] "NDSSName.my.fctrForeign#World#"
## [53] "PubDate.minute.fctr(29.5,44.2]"
## [54] "NDSSName.my.fctrForeign#World#AsiaPacific"
## [55] "PubDate.wkday.fctr5"
## [56] "NDSSName.my.fctr#U.S.#Education"
## [57] "NDSSName.my.fctrCulture#Arts#"
## [58] "NDSSName.my.fctrBusiness#BusinessDay#Dealbook"
## [59] "NDSSName.my.fctr##"
## [60] "NDSSName.my.fctrTStyle##"
## [1] "numeric data missing in : "
## Popular Popular.fctr
## 1870 1870
## [1] "numeric data w/ 0s in : "
## WordCount Popular WordCount.log1p
## 109 5439 109
## WordCount.root2 WordCount.nexp PubDate.wkday.fctr
## 109 2044 378
## PubDate.wkend PubDate.hlday PubDate.day.minutes
## 7787 8160 5
## PubDate.last2.log1p PubDate.last4.log1p PubDate.last8.log1p
## 2 4 8
## PubDate.last16.log1p PubDate.last32.log1p PubDate.last2.log1p.ctg
## 16 32 42
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg PubDate.last16.log1p.ctg
## 84 168 336
## PubDate.last32.log1p.ctg
## 670
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## NewsDesk SectionName SubsectionName Headline Snippet
## 2408 2899 6176 0 13
## Abstract PubDate NDSSName.my .lcn
## 17 0 0 1870
## [1] "glb_feats_df:"
## [1] 42 12
## id exclude.as.feat rsp_var
## Popular.fctr Popular.fctr TRUE TRUE
## id cor.y exclude.as.feat cor.y.abs cor.high.X
## Popular Popular 1.00000000 TRUE 1.00000000 <NA>
## UniqueID UniqueID 0.01182492 TRUE 0.01182492 <NA>
## Popular.fctr Popular.fctr NA TRUE NA <NA>
## freqRatio percentUnique zeroVar nzv is.cor.y.abs.low
## Popular 4.976212 0.03061849 FALSE FALSE FALSE
## UniqueID 1.000000 100.00000000 FALSE FALSE FALSE
## Popular.fctr NA NA NA NA NA
## interaction.feat shapiro.test.p.value rsp_var_raw id_var
## Popular <NA> NA TRUE NA
## UniqueID <NA> NA FALSE TRUE
## Popular.fctr <NA> NA NA NA
## rsp_var
## Popular NA
## UniqueID NA
## Popular.fctr TRUE
## [1] "glb_feats_df vs. glbObsAll: "
## character(0)
## [1] "glbObsAll vs. glb_feats_df: "
## character(0)
## label step_major step_minor label_minor bgn end elapsed
## 9 select.features 5 0 0 49.295 74.755 25.46
## 10 fit.models 6 0 0 74.756 NA NA
6.0: fit models
# load(paste0(glb_out_pfx, "dsk.RData"))
# Build the one-sided formula used to rank the rows of glb_models_df.
#
# Reads the globals glbMdlMetricsEval, glb_models_df, glb_is_classification and
# glb_is_binomial. Each evaluation metric that exists as a column of
# glb_models_df becomes one term: metrics whose name contains "max" get a
# leading "-", all others a leading "+". For binomial classification a final
# "- opt.prob.threshold.OOB" term is appended.
#
# Returns: a formula of the form `~ <sign> metric1 <sign> metric2 ...`.
get_model_sel_frmla <- function() {
    # Keep only metrics that were actually collected (min.aic.fit might not be avl)
    avlMetrics <- glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)]

    # One "<sign> <metric>" term per available metric
    sortTerms <- paste(ifelse(grepl("max", avlMetrics), "-", "+"), avlMetrics)

    if (glb_is_classification && glb_is_binomial)
        sortTerms <- c(sortTerms, "- opt.prob.threshold.OOB")

    as.formula(paste(c("~ ", sortTerms), collapse = " "))
}
# Return the model-comparison table in display order and, as a side effect,
# write a grid of per-model tuning plots to "<glb_out_pfx>bestTune.png".
#
# Reads the globals glbMdlMetricsEval, glb_models_df, glb_models_lst and
# glb_out_pfx.
#
# Returns: glb_models_df sorted by get_model_sel_frmla() and restricted to the
#   "id" column, the available evaluation metrics and every column whose name
#   contains "opt.".
#
# Side effects: prints and warns when a model has no more resampling result
#   rows than tuning parameters; those models, and models with zero tuning
#   parameters, are left out of the plot grid.
get_dsp_models_df <- function() {
    dsp_models_cols <- c("id",
        glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
        grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
    dsp_models_df <-
        #orderBy(get_model_sel_frmla(), glb_models_df)[, c("id", glbMdlMetricsEval)]
        orderBy(get_model_sel_frmla(), glb_models_df)[, dsp_models_cols]

    # Rows in each model's resampling results vs. its number of tuning parameters
    nCvMdl <- sapply(glb_models_lst, function(mdl) nrow(mdl$results))
    nParams <- sapply(glb_models_lst, function(mdl) ifelse(mdl$method == "custom", 0,
        nrow(subset(modelLookup(mdl$method), parameter != "parameter"))))
#     nCvMdl <- nCvMdl[names(nCvMdl) != "avNNet"]
#     nParams <- nParams[names(nParams) != "avNNet"]

    cvMdlProblems <- nCvMdl[nCvMdl <= nParams]
    if (length(cvMdlProblems) > 0) {
        print("Cross Validation issues:")
        warning("Cross Validation issues:")
        print(cvMdlProblems)
    }

    pltMdls <- setdiff(names(nCvMdl), names(cvMdlProblems))
    pltMdls <- setdiff(pltMdls, names(nParams[nParams == 0]))

    # length(pltMdls) == 21
    png(paste0(glb_out_pfx, "bestTune.png"), width = 480 * 2, height = 480 * 4)
    # Close this specific device on every exit path, so a failing plot does not
    # leave the png device open and capture later graphics output.
    pngDev <- dev.cur()
    on.exit(dev.off(which = pngDev), add = TRUE)

    grid.newpage()
    # Two plots per row; keep at least one row so an empty model set still
    # yields a valid layout.
    pushViewport(viewport(layout = grid.layout(
        max(1, ceiling(length(pltMdls) / 2.0)), 2)))
    pltIx <- 1
    for (mdlId in pltMdls) {
        print(ggplot(glb_models_lst[[mdlId]], highBestTune = TRUE) + labs(title = mdlId),
              vp = viewport(layout.pos.row = ceiling(pltIx / 2.0),
                            layout.pos.col = ((pltIx - 1) %% 2) + 1))
        pltIx <- pltIx + 1
    }

    return(dsp_models_df)
}
#get_dsp_models_df()
# A binomial classifier needs both outcome classes in the fit set: abort early
# when the response column of glbObsFit holds fewer than two distinct values.
if (glb_is_classification && glb_is_binomial) {
    if (length(unique(glbObsFit[, glb_rsp_var])) < 2) {
        stop("glbObsFit$", glb_rsp_var, ": contains less than 2 unique values: ",
             paste0(unique(glbObsFit[, glb_rsp_var]), collapse=", "))
    }
}
# Ids of the (at most) two eligible features with the largest cor.y.abs.
# Eligible = not excluded as a feature, not near-zero-variance, not flagged as
# low correlation, and with no cor.high.X entry.
max_cor_y_x_vars <- local({
    eligibleFeats <- subset(glb_feats_df,
                            (exclude.as.feat == 0) & !nzv & !is.cor.y.abs.low &
                                is.na(cor.high.X))
    topIds <- orderBy(~ -cor.y.abs, eligibleFeats)[1:2, "id"]
    # Rows 1:2 are always requested, so fewer than two eligible features
    # yields NA placeholders; drop them.
    topIds[!is.na(topIds)]
})
# When a baseline variable is configured, stop if the top-ranked candidate
# feature is a different variable with a strictly higher cor.y.abs.
if (!is.null(glb_Baseline_mdl_var)) {
    # Scalar condition: use short-circuit && so the correlation lookup only
    # runs when the top feature differs from the baseline variable.
    if ((max_cor_y_x_vars[1] != glb_Baseline_mdl_var) &&
        (glb_feats_df[glb_feats_df$id == max_cor_y_x_vars[1], "cor.y.abs"] >
         glb_feats_df[glb_feats_df$id == glb_Baseline_mdl_var, "cor.y.abs"])) {
        stop(max_cor_y_x_vars[1], " has a higher correlation with ", glb_rsp_var,
             " than the Baseline var: ", glb_Baseline_mdl_var)
    }
}
# Problem type passed as `type` in the model specs below.
# glb_is_regression is a single flag, so use if/else rather than the vectorised
# ifelse(); a missing or zero-length flag now fails here instead of silently
# propagating NA / logical(0).
glb_model_type <- if (glb_is_regression) "regression" else "classification"

# Model specs
#   (bare expression: only prints the list of spec field names for reference)
c("id.prefix", "method", "type",
  # trainControl params
  "preProc.method", "cv.n.folds", "cv.n.repeats", "summary.fn",
  # train params
  "metric", "metric.maximize", "tune.df")
## [1] "id.prefix" "method" "type"
## [4] "preProc.method" "cv.n.folds" "cv.n.repeats"
## [7] "summary.fn" "metric" "metric.maximize"
## [10] "tune.df"
# Baseline: fitted only when a baseline variable has been configured.
# NOTE(review): this call passes mdl_id / model_method / indep_vars_vctr, whereas
# the other myfit_mdl calls in this section pass mdl_specs_lst / indep_vars --
# confirm that myfit_mdl still accepts these argument names.
if (!is.null(glb_Baseline_mdl_var)) {
    ret_lst <- myfit_mdl(
        mdl_id="Baseline",
        model_method="mybaseln_classfr",
        indep_vars_vctr=glb_Baseline_mdl_var,
        rsp_var=glb_rsp_var,
        fit_df=glbObsFit,
        OOB_df=glbObsOOB
    )
}
# Most Frequent Outcome "MFO" model: mean(y) for regression
#   Not using caret's nullModel since model stats not avl
#   Cannot use rpart for multinomial classification since it predicts non-MFO
# Fitted without resampling on the single noise feature .rnorm.
# NOTE(review): in the knitted output for this fit, MFO.val is "N" yet every
#   observation is predicted "Y" (accuracy 0.18 vs. AccuracyNull 0.82) --
#   presumably the 0.1 probability threshold is applied; confirm in myfit_mdl.
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(
        mdl_specs_lst = list(
            id.prefix = "MFO",
            type = glb_model_type,
            trainControl.method = "none",
            train.method = ifelse(glb_is_regression, "lm", "myMFO_classfr")
        )
    ),
    indep_vars = ".rnorm",
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
)
## [1] "fitting model: MFO###myMFO_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## [1] "in MFO.Classifier$fit"
## [1] "unique.vals:"
## [1] N Y
## Levels: N Y
## [1] "unique.prob:"
## y
## N Y
## 0.820358 0.179642
## [1] "MFO.val:"
## [1] "N"
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 -none- numeric
## MFO.val 1 -none- character
## x.names 1 -none- character
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.820358 0.179642
## 2 0.820358 0.179642
## 3 0.820358 0.179642
## 4 0.820358 0.179642
## 5 0.820358 0.179642
## 6 0.820358 0.179642
## Prediction
## Reference N Y
## N 0 3941
## Y 0 863
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1796420 0.0000000 0.1688795 0.1907952 0.8203580
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.820358 0.179642
## 2 0.820358 0.179642
## 3 0.820358 0.179642
## 4 0.820358 0.179642
## 5 0.820358 0.179642
## 6 0.820358 0.179642
## Prediction
## Reference N Y
## N 0 1498
## Y 0 230
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1331019 0.0000000 0.1174298 0.1500310 0.8668981
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## id feats max.nTuningRuns min.elapsedtime.everything
## 1 MFO###myMFO_classfr .rnorm 0 0.296
## min.elapsedtime.final max.AUCpROC.fit max.Sens.fit max.Spec.fit
## 1 0.004 0.5 1 0
## max.AUCROCR.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.1 0.3045703 0.179642
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.1688795 0.1907952 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.2349336 0.1331019
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.1174298 0.150031 0
# "random" model - only for classification;
#   none needed for regression since it is same as MFO
if (glb_is_classification) {
    #stop(here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
    ret_lst <- myfit_mdl(
        mdl_specs_lst = myinit_mdl_specs_lst(
            mdl_specs_lst = list(
                id.prefix = "Random",
                type = glb_model_type,
                trainControl.method = "none",
                train.method = "myrandom_classfr"
            )
        ),
        indep_vars = ".rnorm",
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit,
        OOB_df = glbObsOOB
    )
}
## [1] "fitting model: Random###myrandom_classfr"
## [1] " indep_vars: .rnorm"
## Fitting parameter = none on full training set
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 table numeric
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] "in Random.Classifier$prob"
## Prediction
## Reference N Y
## N 0 3941
## Y 0 863
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1796420 0.0000000 0.1688795 0.1907952 0.8203580
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## [1] "in Random.Classifier$prob"
## Prediction
## Reference N Y
## N 0 1498
## Y 0 230
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.1331019 0.0000000 0.1174298 0.1500310 0.8668981
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## id feats max.nTuningRuns
## 1 Random###myrandom_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 0.301 0.002 0.4990604
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.8312611 0.1668598 0.4972757 0.1
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.3045703 0.179642 0.1688795
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.1907952 0 0.5125675 0.8077437
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.2173913 0.4857956 0.1 0.2349336
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.1331019 0.1174298 0.150031
## max.Kappa.OOB
## 1 0
# ret_lst <- myfit_mdl(mdl_id = "Random", model_method = "myrandom_classfr",
# model_type = glb_model_type,
# indep_vars_vctr = ".rnorm",
# rsp_var = glb_rsp_var,
# fit_df = glbObsFit, OOB_df = glbObsOOB)
# Max.cor.Y
#   Check impact of cv: this first fit uses trainControl.method = "none" and
#   serves as the reference for the repeated-cv fits further down.
#   rpart is not a good candidate since caret does not optimize cp (only tuning parameter of rpart) well
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(
        mdl_specs_lst = list(
            id.prefix = "Max.cor.Y.rcv.1X1",
            type = glb_model_type,
            trainControl.method = "none",
            train.method = "glmnet"
        )
    ),
    indep_vars = max_cor_y_x_vars,
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
)
## [1] "fitting model: Max.cor.Y.rcv.1X1###glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 2.0-2
## Fitting alpha = 0.1, lambda = 0.00434 on full training set
## Length Class Mode
## a0 100 -none- numeric
## beta 2100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -4.57159198
## NDSSName.my.fctr#Multimedia#
## -1.22219085
## NDSSName.my.fctr#Opinion#RoomForDebate
## -3.46072453
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 4.06871185
## NDSSName.my.fctr#U.S.#Education
## -1.89443632
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22472818
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.95537118
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 4.55408513
## NDSSName.my.fctrBusiness#Technology#
## 0.77368538
## NDSSName.my.fctrCulture#Arts#
## -0.09465691
## NDSSName.my.fctrForeign#World#
## -1.45528874
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.60117505
## NDSSName.my.fctrMetro#N.Y./Region#
## 0.01563989
## NDSSName.my.fctrOpEd#Opinion#
## 4.51696382
## NDSSName.my.fctrScience#Health#
## 3.51595317
## NDSSName.my.fctrStyles##Fashion
## -1.85948925
## NDSSName.my.fctrStyles#U.S.#
## 3.27995325
## NDSSName.my.fctrTStyle##
## -1.54110404
## NDSSName.my.fctrTravel#Travel#
## -1.41940605
## NDSSName.my.fctrmyOther
## -1.90156922
## WordCount.root2
## 0.08434378
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -4.60394059
## NDSSName.my.fctr#Multimedia#
## -1.25163328
## NDSSName.my.fctr#Opinion#RoomForDebate
## -3.55521332
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 4.09217313
## NDSSName.my.fctr#U.S.#Education
## -1.96172971
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22495986
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.96836050
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 4.58120497
## NDSSName.my.fctrBusiness#Technology#
## 0.78504703
## NDSSName.my.fctrCulture#Arts#
## -0.09069661
## NDSSName.my.fctrForeign#World#
## -1.51061232
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.63313235
## NDSSName.my.fctrMetro#N.Y./Region#
## 0.02466697
## NDSSName.my.fctrOpEd#Opinion#
## 4.54361134
## NDSSName.my.fctrScience#Health#
## 3.53210055
## NDSSName.my.fctrStyles##Fashion
## -1.92188290
## NDSSName.my.fctrStyles#U.S.#
## 3.29488750
## NDSSName.my.fctrTStyle##
## -1.57788931
## NDSSName.my.fctrTravel#Travel#
## -1.47368131
## NDSSName.my.fctrmyOther
## -1.97357582
## WordCount.root2
## 0.08537319
## Prediction
## Reference N Y
## N 3796 145
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.329725e-01 7.692476e-01 9.255302e-01 9.398832e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.026390e-114 8.406670e-02
## Prediction
## Reference N Y
## N 1151 347
## Y 67 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.604167e-01 3.148374e-01 7.395703e-01 7.803749e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 8.593187e-43
## id feats
## 1 Max.cor.Y.rcv.1X1###glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 1.051 0.279
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8790544 0.9632073 0.7949015 0.9608594
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.8099174 0.9329725
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9255302 0.9398832 0.7692476
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8116126
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4405405 0.7604167
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7395703 0.7803749 0.3148374
# Fit the Max.cor.Y glmnet model once per (folds, repeats) combination of
# repeated cross-validation. Folds step by 2 from 3 up to glb_rcv_n_folds + 2,
# repeats step by 2 from 1 up to glb_rcv_n_repeats + 2.
# rcv_n_folds == 1 & rcv_n_repeats > 1 crashes
for (rcv_n_folds in seq(3, glb_rcv_n_folds + 2, 2)) {
    for (rcv_n_repeats in seq(1, glb_rcv_n_repeats + 2, 2)) {

        # Experiment specific code to avoid caret crash
    #     lcl_tune_models_df <- rbind(data.frame()
    #                         ,data.frame(method = "glmnet", parameter = "alpha",
    #                                     vals = "0.100 0.325 0.550 0.775 1.000")
    #                         ,data.frame(method = "glmnet", parameter = "lambda",
    #                                     vals = "9.342e-02")
    #                                 )

        ret_lst <- myfit_mdl(
            mdl_specs_lst = myinit_mdl_specs_lst(
                mdl_specs_lst = list(
                    # Model id records the folds x repeats combination
                    id.prefix = paste0("Max.cor.Y.rcv.", rcv_n_folds, "X", rcv_n_repeats),
                    type = glb_model_type,
    #               tune.df = lcl_tune_models_df,
                    trainControl.method = "repeatedcv",
                    trainControl.number = rcv_n_folds,
                    trainControl.repeats = rcv_n_repeats,
                    trainControl.classProbs = glb_is_classification,
                    trainControl.summaryFunction = glbMdlMetricSummaryFn,
                    train.method = "glmnet",
                    train.metric = glbMdlMetricSummary,
                    train.maximize = glbMdlMetricMaximize
                )
            ),
            indep_vars = max_cor_y_x_vars,
            rsp_var = glb_rsp_var,
            fit_df = glbObsFit,
            OOB_df = glbObsOOB
        )
    }
}
## [1] "fitting model: Max.cor.Y.rcv.3X1##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.325, lambda = 0.0201 on full training set
## Length Class Mode
## a0 99 -none- numeric
## beta 2079 dgCMatrix S4
## df 99 -none- numeric
## dim 2 -none- numeric
## lambda 99 -none- numeric
## dev.ratio 99 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.89350373
## NDSSName.my.fctr#Multimedia#
## -0.01916344
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.18453357
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.21701058
## NDSSName.my.fctr#U.S.#Education
## -0.47679040
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.09891374
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.87281404
## NDSSName.my.fctrBusiness#Technology#
## 0.40965256
## NDSSName.my.fctrForeign#World#
## -0.05114617
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.47464340
## NDSSName.my.fctrOpEd#Opinion#
## 3.95214357
## NDSSName.my.fctrScience#Health#
## 3.14232408
## NDSSName.my.fctrStyles##Fashion
## -0.31867093
## NDSSName.my.fctrStyles#U.S.#
## 2.92610567
## NDSSName.my.fctrTStyle##
## -0.60538025
## WordCount.root2
## 0.05783392
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.95644632
## NDSSName.my.fctr#Multimedia#
## -0.07182859
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.30034382
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.29694236
## NDSSName.my.fctr#U.S.#Education
## -0.53415905
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.14259759
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.94231812
## NDSSName.my.fctrBusiness#Technology#
## 0.45657914
## NDSSName.my.fctrForeign#World#
## -0.10021084
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.53077048
## NDSSName.my.fctrOpEd#Opinion#
## 4.01324666
## NDSSName.my.fctrScience#Health#
## 3.18936803
## NDSSName.my.fctrStyles##Fashion
## -0.38069674
## NDSSName.my.fctrStyles#U.S.#
## 2.97176051
## NDSSName.my.fctrTStyle##
## -0.64837249
## WordCount.root2
## 0.05978318
## Prediction
## Reference N Y
## N 3796 145
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.329725e-01 7.692476e-01 9.255302e-01 9.398832e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.026390e-114 8.406670e-02
## Prediction
## Reference N Y
## N 1146 352
## Y 67 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.575231e-01 3.107477e-01 7.365992e-01 7.775689e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 9.066396e-44
## id feats
## 1 Max.cor.Y.rcv.3X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 2.538 0.273
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8767919 0.964476 0.7891078 0.9582555
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8099174 0.9335973
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9255302 0.9398832 0.7691678
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8067975
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4375839 0.7575231
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7365992 0.7775689 0.3107477
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.007015493 0.02403706
## [1] "fitting model: Max.cor.Y.rcv.3X3##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.325, lambda = 0.0201 on full training set
## Length Class Mode
## a0 99 -none- numeric
## beta 2079 dgCMatrix S4
## df 99 -none- numeric
## dim 2 -none- numeric
## lambda 99 -none- numeric
## dev.ratio 99 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.89350373
## NDSSName.my.fctr#Multimedia#
## -0.01916344
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.18453357
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.21701058
## NDSSName.my.fctr#U.S.#Education
## -0.47679040
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.09891374
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.87281404
## NDSSName.my.fctrBusiness#Technology#
## 0.40965256
## NDSSName.my.fctrForeign#World#
## -0.05114617
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.47464340
## NDSSName.my.fctrOpEd#Opinion#
## 3.95214357
## NDSSName.my.fctrScience#Health#
## 3.14232408
## NDSSName.my.fctrStyles##Fashion
## -0.31867093
## NDSSName.my.fctrStyles#U.S.#
## 2.92610567
## NDSSName.my.fctrTStyle##
## -0.60538025
## WordCount.root2
## 0.05783392
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.95644632
## NDSSName.my.fctr#Multimedia#
## -0.07182859
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.30034382
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.29694236
## NDSSName.my.fctr#U.S.#Education
## -0.53415905
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.14259759
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.94231812
## NDSSName.my.fctrBusiness#Technology#
## 0.45657914
## NDSSName.my.fctrForeign#World#
## -0.10021084
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.53077048
## NDSSName.my.fctrOpEd#Opinion#
## 4.01324666
## NDSSName.my.fctrScience#Health#
## 3.18936803
## NDSSName.my.fctrStyles##Fashion
## -0.38069674
## NDSSName.my.fctrStyles#U.S.#
## 2.97176051
## NDSSName.my.fctrTStyle##
## -0.64837249
## WordCount.root2
## 0.05978318
## Prediction
## Reference N Y
## N 3796 145
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.329725e-01 7.692476e-01 9.255302e-01 9.398832e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.026390e-114 8.406670e-02
## Prediction
## Reference N Y
## N 1146 352
## Y 67 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.575231e-01 3.107477e-01 7.365992e-01 7.775689e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 9.066396e-44
## id feats
## 1 Max.cor.Y.rcv.3X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 4.667 0.274
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8767919 0.964476 0.7891078 0.9582555
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8099174 0.9333193
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9255302 0.9398832 0.7690803
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8067975
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4375839 0.7575231
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7365992 0.7775689 0.3107477
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005178375 0.01754365
## [1] "fitting model: Max.cor.Y.rcv.3X5##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.325, lambda = 0.0201 on full training set
## Length Class Mode
## a0 99 -none- numeric
## beta 2079 dgCMatrix S4
## df 99 -none- numeric
## dim 2 -none- numeric
## lambda 99 -none- numeric
## dev.ratio 99 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.89350373
## NDSSName.my.fctr#Multimedia#
## -0.01916344
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.18453357
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.21701058
## NDSSName.my.fctr#U.S.#Education
## -0.47679040
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.09891374
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.87281404
## NDSSName.my.fctrBusiness#Technology#
## 0.40965256
## NDSSName.my.fctrForeign#World#
## -0.05114617
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.47464340
## NDSSName.my.fctrOpEd#Opinion#
## 3.95214357
## NDSSName.my.fctrScience#Health#
## 3.14232408
## NDSSName.my.fctrStyles##Fashion
## -0.31867093
## NDSSName.my.fctrStyles#U.S.#
## 2.92610567
## NDSSName.my.fctrTStyle##
## -0.60538025
## WordCount.root2
## 0.05783392
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.95644632
## NDSSName.my.fctr#Multimedia#
## -0.07182859
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.30034382
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.29694236
## NDSSName.my.fctr#U.S.#Education
## -0.53415905
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.14259759
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.94231812
## NDSSName.my.fctrBusiness#Technology#
## 0.45657914
## NDSSName.my.fctrForeign#World#
## -0.10021084
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.53077048
## NDSSName.my.fctrOpEd#Opinion#
## 4.01324666
## NDSSName.my.fctrScience#Health#
## 3.18936803
## NDSSName.my.fctrStyles##Fashion
## -0.38069674
## NDSSName.my.fctrStyles#U.S.#
## 2.97176051
## NDSSName.my.fctrTStyle##
## -0.64837249
## WordCount.root2
## 0.05978318
## Prediction
## Reference N Y
## N 3796 145
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.329725e-01 7.692476e-01 9.255302e-01 9.398832e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.026390e-114 8.406670e-02
## Prediction
## Reference N Y
## N 1146 352
## Y 67 163
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.575231e-01 3.107477e-01 7.365992e-01 7.775689e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 9.066396e-44
## id feats
## 1 Max.cor.Y.rcv.3X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 7.072 0.276
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8767919 0.964476 0.7891078 0.9582555
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8099174 0.9332218
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9255302 0.9398832 0.7686375
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8067975
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4375839 0.7575231
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7365992 0.7775689 0.3107477
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005396525 0.01835474
## [1] "fitting model: Max.cor.Y.rcv.5X1##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0201 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = paste0("Max.cor.Y.rcv.", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Length Class Mode
## a0 100 -none- numeric
## beta 2100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.81141260
## NDSSName.my.fctr#Multimedia#
## -0.68105584
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.92624537
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.40699589
## NDSSName.my.fctr#U.S.#Education
## -0.98291999
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22577146
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.64343834
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.82797332
## NDSSName.my.fctrBusiness#Technology#
## 0.45317927
## NDSSName.my.fctrCulture#Arts#
## -0.17187706
## NDSSName.my.fctrForeign#World#
## -0.72035867
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.99018968
## NDSSName.my.fctrOpEd#Opinion#
## 3.81891156
## NDSSName.my.fctrScience#Health#
## 3.05516080
## NDSSName.my.fctrStyles##Fashion
## -0.97651721
## NDSSName.my.fctrStyles#U.S.#
## 2.84779285
## NDSSName.my.fctrTStyle##
## -0.94109645
## NDSSName.my.fctrTravel#Travel#
## -0.68827560
## NDSSName.my.fctrmyOther
## -0.84423735
## WordCount.root2
## 0.06115867
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.87108412
## NDSSName.my.fctr#Multimedia#
## -0.71588942
## NDSSName.my.fctr#Opinion#RoomForDebate
## -2.02010163
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.46715540
## NDSSName.my.fctr#U.S.#Education
## -1.02957582
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22558850
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.66798026
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.89132347
## NDSSName.my.fctrBusiness#Technology#
## 0.48212450
## NDSSName.my.fctrCulture#Arts#
## -0.16733777
## NDSSName.my.fctrForeign#World#
## -0.75793881
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.03076807
## NDSSName.my.fctrOpEd#Opinion#
## 3.87908175
## NDSSName.my.fctrScience#Health#
## 3.09788786
## NDSSName.my.fctrStyles##Fashion
## -1.02481879
## NDSSName.my.fctrStyles#U.S.#
## 2.88826078
## NDSSName.my.fctrTStyle##
## -0.97585470
## NDSSName.my.fctrTravel#Travel#
## -0.72668427
## NDSSName.my.fctrmyOther
## -0.90347045
## WordCount.root2
## 0.06289698
## Prediction
## Reference N Y
## N 3800 141
## Y 179 684
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.333888e-01 7.700473e-01 9.259666e-01 9.402789e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.097051e-115 3.860591e-02
## Prediction
## Reference N Y
## N 1137 361
## Y 53 177
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.604167e-01 3.373693e-01 7.395703e-01 7.803749e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.935747e-51
## id feats
## 1 Max.cor.Y.rcv.5X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 3.53 0.277
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8784031 0.9642223 0.792584 0.9607052
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.8104265 0.9331818
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9259666 0.9402789 0.7689055
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8114863
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4609375 0.7604167
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7395703 0.7803749 0.3373693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008837283 0.03133449
## [1] "fitting model: Max.cor.Y.rcv.5X3##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0201 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = paste0("Max.cor.Y.rcv.", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Length Class Mode
## a0 100 -none- numeric
## beta 2100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.81141260
## NDSSName.my.fctr#Multimedia#
## -0.68105584
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.92624537
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.40699589
## NDSSName.my.fctr#U.S.#Education
## -0.98291999
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22577146
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.64343834
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.82797332
## NDSSName.my.fctrBusiness#Technology#
## 0.45317927
## NDSSName.my.fctrCulture#Arts#
## -0.17187706
## NDSSName.my.fctrForeign#World#
## -0.72035867
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.99018968
## NDSSName.my.fctrOpEd#Opinion#
## 3.81891156
## NDSSName.my.fctrScience#Health#
## 3.05516080
## NDSSName.my.fctrStyles##Fashion
## -0.97651721
## NDSSName.my.fctrStyles#U.S.#
## 2.84779285
## NDSSName.my.fctrTStyle##
## -0.94109645
## NDSSName.my.fctrTravel#Travel#
## -0.68827560
## NDSSName.my.fctrmyOther
## -0.84423735
## WordCount.root2
## 0.06115867
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.87108412
## NDSSName.my.fctr#Multimedia#
## -0.71588942
## NDSSName.my.fctr#Opinion#RoomForDebate
## -2.02010163
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.46715540
## NDSSName.my.fctr#U.S.#Education
## -1.02957582
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22558850
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.66798026
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.89132347
## NDSSName.my.fctrBusiness#Technology#
## 0.48212450
## NDSSName.my.fctrCulture#Arts#
## -0.16733777
## NDSSName.my.fctrForeign#World#
## -0.75793881
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.03076807
## NDSSName.my.fctrOpEd#Opinion#
## 3.87908175
## NDSSName.my.fctrScience#Health#
## 3.09788786
## NDSSName.my.fctrStyles##Fashion
## -1.02481879
## NDSSName.my.fctrStyles#U.S.#
## 2.88826078
## NDSSName.my.fctrTStyle##
## -0.97585470
## NDSSName.my.fctrTravel#Travel#
## -0.72668427
## NDSSName.my.fctrmyOther
## -0.90347045
## WordCount.root2
## 0.06289698
## Prediction
## Reference N Y
## N 3800 141
## Y 179 684
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.333888e-01 7.700473e-01 9.259666e-01 9.402789e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.097051e-115 3.860591e-02
## Prediction
## Reference N Y
## N 1137 361
## Y 53 177
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.604167e-01 3.373693e-01 7.395703e-01 7.803749e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.935747e-51
## id feats
## 1 Max.cor.Y.rcv.5X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 7.415 0.285
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8784031 0.9642223 0.792584 0.9607052
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.8104265 0.9333905
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9259666 0.9402789 0.7698577
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8114863
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4609375 0.7604167
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7395703 0.7803749 0.3373693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.006138477 0.02161286
## [1] "fitting model: Max.cor.Y.rcv.5X5##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0201 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = paste0("Max.cor.Y.rcv.", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Length Class Mode
## a0 100 -none- numeric
## beta 2100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 21 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.81141260
## NDSSName.my.fctr#Multimedia#
## -0.68105584
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.92624537
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.40699589
## NDSSName.my.fctr#U.S.#Education
## -0.98291999
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22577146
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.64343834
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.82797332
## NDSSName.my.fctrBusiness#Technology#
## 0.45317927
## NDSSName.my.fctrCulture#Arts#
## -0.17187706
## NDSSName.my.fctrForeign#World#
## -0.72035867
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.99018968
## NDSSName.my.fctrOpEd#Opinion#
## 3.81891156
## NDSSName.my.fctrScience#Health#
## 3.05516080
## NDSSName.my.fctrStyles##Fashion
## -0.97651721
## NDSSName.my.fctrStyles#U.S.#
## 2.84779285
## NDSSName.my.fctrTStyle##
## -0.94109645
## NDSSName.my.fctrTravel#Travel#
## -0.68827560
## NDSSName.my.fctrmyOther
## -0.84423735
## WordCount.root2
## 0.06115867
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.87108412
## NDSSName.my.fctr#Multimedia#
## -0.71588942
## NDSSName.my.fctr#Opinion#RoomForDebate
## -2.02010163
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 3.46715540
## NDSSName.my.fctr#U.S.#Education
## -1.02957582
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.22558850
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.66798026
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.89132347
## NDSSName.my.fctrBusiness#Technology#
## 0.48212450
## NDSSName.my.fctrCulture#Arts#
## -0.16733777
## NDSSName.my.fctrForeign#World#
## -0.75793881
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.03076807
## NDSSName.my.fctrOpEd#Opinion#
## 3.87908175
## NDSSName.my.fctrScience#Health#
## 3.09788786
## NDSSName.my.fctrStyles##Fashion
## -1.02481879
## NDSSName.my.fctrStyles#U.S.#
## 2.88826078
## NDSSName.my.fctrTStyle##
## -0.97585470
## NDSSName.my.fctrTravel#Travel#
## -0.72668427
## NDSSName.my.fctrmyOther
## -0.90347045
## WordCount.root2
## 0.06289698
## Prediction
## Reference N Y
## N 3800 141
## Y 179 684
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.333888e-01 7.700473e-01 9.259666e-01 9.402789e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 1.097051e-115 3.860591e-02
## Prediction
## Reference N Y
## N 1137 361
## Y 53 177
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.604167e-01 3.373693e-01 7.395703e-01 7.803749e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.935747e-51
## id feats
## 1 Max.cor.Y.rcv.5X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 9.956 0.275
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8784031 0.9642223 0.792584 0.9607052
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.8104265 0.9331816
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9259666 0.9402789 0.7691429
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5962443 0.9098798 0.2826087 0.8114863
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4609375 0.7604167
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7395703 0.7803749 0.3373693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.0062138 0.02210061
# Add parallel coordinates graph of glb_models_df[, glbMdlMetricsEval] to evaluate cv parameters
# Columns to plot: the model id, the number of tuning runs, every metric named
# in glbMdlMetricsEval that glb_models_df actually contains, and every column
# whose name contains the literal text "opt." (e.g. opt.prob.threshold.fit).
# unique() keeps a column from being selected twice when it is matched by both
# glbMdlMetricsEval and the "opt." pattern; a repeated name would otherwise
# come back from `[` as an extra "<name>.1" column, i.e. a duplicated axis.
tmp_models_cols <- unique(c(
  "id", "max.nTuningRuns",
  glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
  grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE)
))
# Plot only the models whose id contains "Max.cor.Y.rcv." so the different
# repeated-cv settings can be compared side by side.
# Standard `[` indexing replaces subset(): subset() relies on non-standard
# evaluation and is documented as an interactive convenience only.
# setdiff(..., "feats") preserves the original guarantee that the feats column
# is never handed to the plot; drop = FALSE keeps the result a data frame.
print(myplot_parcoord(
  obs_df = glb_models_df[
    grepl("Max.cor.Y.rcv.", glb_models_df[["id"]], fixed = TRUE),
    setdiff(tmp_models_cols, "feats"),
    drop = FALSE
  ],
  id_var = "id"
))
# Fit a single rpart model on max_cor_y_x_vars with the complexity parameter
# pinned at cp = 0 (tune.df has min = max = 0) and trainControl.method set to
# "none", so there is one fit on the full fit_df and no tuning grid to search.
# NOTE(review): OOB_df is presumably the hold-out set behind the *.OOB metrics
# that myfit_mdl reports -- confirm against myfit_mdl's definition.
ret_lst <- myfit_mdl(
  mdl_specs_lst = myinit_mdl_specs_lst(
    mdl_specs_lst = list(
      id.prefix = "Max.cor.Y.rcv.1X1.cp.0",
      type = glb_model_type,
      trainControl.method = "none",
      train.method = "rpart",
      tune.df = data.frame(
        method = "rpart", parameter = "cp",
        min = 0.0, max = 0.0, by = 0.1
      )
    )
  ),
  indep_vars = max_cor_y_x_vars,
  rsp_var = glb_rsp_var,
  fit_df = glbObsFit,
  OOB_df = glbObsOOB
)
## [1] "fitting model: Max.cor.Y.rcv.1X1.cp.0###rpart"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## Loading required package: rpart
## Fitting cp = 0 on full training set
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4804
##
## CP nsplit rel error
## 1 0.3696407879 0 1.0000000
## 2 0.0984936269 1 0.6303592
## 3 0.0857473928 2 0.5318656
## 4 0.0567786790 3 0.4461182
## 5 0.0104287370 4 0.3893395
## 6 0.0057937428 5 0.3789108
## 7 0.0034762457 7 0.3673233
## 8 0.0023174971 8 0.3638470
## 9 0.0011587486 11 0.3568946
## 10 0.0007724990 13 0.3545771
## 11 0.0005793743 16 0.3522596
## 12 0.0004213631 24 0.3476246
## 13 0.0003862495 35 0.3429896
## 14 0.0000000000 41 0.3406721
##
## Variable importance
## NDSSName.my.fctrOpEd#Opinion#
## 48
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 14
## NDSSName.my.fctrScience#Health#
## 14
## NDSSName.my.fctrStyles#U.S.#
## 11
## WordCount.root2
## 9
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## 1
##
## Node number 1: 4804 observations, complexity param=0.3696408
## predicted class=N expected loss=0.179642 P(node) =1
## class counts: 3941 863
## probabilities: 0.820 0.180
## left son=2 (4367 obs) right son=3 (437 obs)
## Primary splits:
## NDSSName.my.fctrOpEd#Opinion# < 0.5 to the left, improve=451.59770, (0 missing)
## NDSSName.my.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve=112.88510, (0 missing)
## WordCount.root2 < 25.75849 to the left, improve=111.17610, (0 missing)
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve= 99.35206, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 68.73272, (0 missing)
##
## Node number 2: 4367 observations, complexity param=0.09849363
## predicted class=N expected loss=0.1110602 P(node) =0.9090341
## class counts: 3882 485
## probabilities: 0.889 0.111
## left son=4 (4262 obs) right son=5 (105 obs)
## Primary splits:
## NDSSName.my.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve=135.55130, (0 missing)
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve=125.07920, (0 missing)
## WordCount.root2 < 25.75849 to the left, improve= 94.70710, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 88.56821, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 18.74400, (0 missing)
##
## Node number 3: 437 observations
## predicted class=Y expected loss=0.1350114 P(node) =0.09096586
## class counts: 59 378
## probabilities: 0.135 0.865
##
## Node number 4: 4262 observations, complexity param=0.08574739
## predicted class=N expected loss=0.09150634 P(node) =0.8871774
## class counts: 3872 390
## probabilities: 0.908 0.092
## left son=8 (4114 obs) right son=9 (148 obs)
## Primary splits:
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve=132.96710, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 94.69099, (0 missing)
## WordCount.root2 < 26.49528 to the left, improve= 84.07487, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 19.71762, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 10.17000, (0 missing)
##
## Node number 5: 105 observations, complexity param=0.002317497
## predicted class=Y expected loss=0.0952381 P(node) =0.02185679
## class counts: 10 95
## probabilities: 0.095 0.905
## left son=10 (12 obs) right son=11 (93 obs)
## Primary splits:
## WordCount.root2 < 18.9043 to the left, improve=6.455453, (0 missing)
##
## Node number 8: 4114 observations, complexity param=0.05677868
## predicted class=N expected loss=0.06781721 P(node) =0.8563697
## class counts: 3835 279
## probabilities: 0.932 0.068
## left son=16 (3987 obs) right son=17 (127 obs)
## Primary splits:
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve=102.410700, (0 missing)
## WordCount.root2 < 25.01 to the left, improve= 47.352210, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 20.930810, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 5.249425, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve= 2.395935, (0 missing)
##
## Node number 9: 148 observations, complexity param=0.01042874
## predicted class=Y expected loss=0.25 P(node) =0.03080766
## class counts: 37 111
## probabilities: 0.250 0.750
## left son=18 (55 obs) right son=19 (93 obs)
## Primary splits:
## WordCount.root2 < 22.72663 to the left, improve=19.274, (0 missing)
##
## Node number 10: 12 observations
## predicted class=N expected loss=0.4166667 P(node) =0.002497918
## class counts: 7 5
## probabilities: 0.583 0.417
##
## Node number 11: 93 observations
## predicted class=Y expected loss=0.03225806 P(node) =0.01935887
## class counts: 3 90
## probabilities: 0.032 0.968
##
## Node number 16: 3987 observations, complexity param=0.005793743
## predicted class=N expected loss=0.04790569 P(node) =0.8299334
## class counts: 3796 191
## probabilities: 0.952 0.048
## left son=32 (2982 obs) right son=33 (1005 obs)
## Primary splits:
## WordCount.root2 < 25.01 to the left, improve=29.253580, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve=21.978920, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve= 3.887348, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 2.348653, (0 missing)
## NDSSName.my.fctr#U.S.#Education < 0.5 to the right, improve= 1.187739, (0 missing)
## Surrogate splits:
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the left, agree=0.758, adj=0.042, (0 split)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the left, agree=0.752, adj=0.016, (0 split)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, agree=0.750, adj=0.008, (0 split)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the left, agree=0.748, adj=0.002, (0 split)
##
## Node number 17: 127 observations, complexity param=0.003476246
## predicted class=Y expected loss=0.3070866 P(node) =0.0264363
## class counts: 39 88
## probabilities: 0.307 0.693
## left son=34 (13 obs) right son=35 (114 obs)
## Primary splits:
## WordCount.root2 < 15.32846 to the left, improve=2.753047, (0 missing)
##
## Node number 18: 55 observations, complexity param=0.002317497
## predicted class=N expected loss=0.4181818 P(node) =0.01144879
## class counts: 32 23
## probabilities: 0.582 0.418
## left son=36 (9 obs) right son=37 (46 obs)
## Primary splits:
## WordCount.root2 < 19.93708 to the right, improve=0.8264383, (0 missing)
##
## Node number 19: 93 observations
## predicted class=Y expected loss=0.05376344 P(node) =0.01935887
## class counts: 5 88
## probabilities: 0.054 0.946
##
## Node number 32: 2982 observations
## predicted class=N expected loss=0.01274313 P(node) =0.6207327
## class counts: 2944 38
## probabilities: 0.987 0.013
##
## Node number 33: 1005 observations, complexity param=0.005793743
## predicted class=N expected loss=0.1522388 P(node) =0.2092007
## class counts: 852 153
## probabilities: 0.848 0.152
## left son=66 (993 obs) right son=67 (12 obs)
## Primary splits:
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve=14.193880, (0 missing)
## NDSSName.my.fctrCulture#Arts# < 0.5 to the left, improve= 3.669601, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve= 3.556158, (0 missing)
## WordCount.root2 < 34.19795 to the left, improve= 2.582851, (0 missing)
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the right, improve= 2.031748, (0 missing)
##
## Node number 34: 13 observations
## predicted class=N expected loss=0.3846154 P(node) =0.002706078
## class counts: 8 5
## probabilities: 0.615 0.385
##
## Node number 35: 114 observations, complexity param=0.000772499
## predicted class=Y expected loss=0.2719298 P(node) =0.02373022
## class counts: 31 83
## probabilities: 0.272 0.728
## left son=70 (79 obs) right son=71 (35 obs)
## Primary splits:
## WordCount.root2 < 29.21444 to the left, improve=1.020279, (0 missing)
##
## Node number 36: 9 observations
## predicted class=N expected loss=0.2222222 P(node) =0.001873439
## class counts: 7 2
## probabilities: 0.778 0.222
##
## Node number 37: 46 observations, complexity param=0.002317497
## predicted class=N expected loss=0.4565217 P(node) =0.009575354
## class counts: 25 21
## probabilities: 0.543 0.457
## left son=74 (36 obs) right son=75 (10 obs)
## Primary splits:
## WordCount.root2 < 17.01454 to the left, improve=1.514976, (0 missing)
##
## Node number 66: 993 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.143001 P(node) =0.2067027
## class counts: 851 142
## probabilities: 0.857 0.143
## left son=132 (930 obs) right son=133 (63 obs)
## Primary splits:
## NDSSName.my.fctrCulture#Arts# < 0.5 to the left, improve=4.094729, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=3.106316, (0 missing)
## WordCount.root2 < 29.5127 to the left, improve=2.722793, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve=1.962300, (0 missing)
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the right, improve=1.793603, (0 missing)
##
## Node number 67: 12 observations
## predicted class=Y expected loss=0.08333333 P(node) =0.002497918
## class counts: 1 11
## probabilities: 0.083 0.917
##
## Node number 70: 79 observations, complexity param=0.000772499
## predicted class=Y expected loss=0.3164557 P(node) =0.01644463
## class counts: 25 54
## probabilities: 0.316 0.684
## left son=140 (25 obs) right son=141 (54 obs)
## Primary splits:
## WordCount.root2 < 27.36786 to the right, improve=0.5105485, (0 missing)
##
## Node number 71: 35 observations
## predicted class=Y expected loss=0.1714286 P(node) =0.007285595
## class counts: 6 29
## probabilities: 0.171 0.829
##
## Node number 74: 36 observations, complexity param=0.001158749
## predicted class=N expected loss=0.3888889 P(node) =0.007493755
## class counts: 22 14
## probabilities: 0.611 0.389
## left son=148 (8 obs) right son=149 (28 obs)
## Primary splits:
## WordCount.root2 < 15.74773 to the right, improve=0.3968254, (0 missing)
##
## Node number 75: 10 observations
## predicted class=Y expected loss=0.3 P(node) =0.002081599
## class counts: 3 7
## probabilities: 0.300 0.700
##
## Node number 132: 930 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.1311828 P(node) =0.1935887
## class counts: 808 122
## probabilities: 0.869 0.131
## left son=264 (627 obs) right son=265 (303 obs)
## Primary splits:
## WordCount.root2 < 33.97057 to the left, improve=2.913816, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=2.586923, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve=2.402029, (0 missing)
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the right, improve=1.513920, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the left, improve=1.276783, (0 missing)
## Surrogate splits:
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the left, agree=0.719, adj=0.139, (0 split)
##
## Node number 133: 63 observations, complexity param=0.0003862495
## predicted class=N expected loss=0.3174603 P(node) =0.01311407
## class counts: 43 20
## probabilities: 0.683 0.317
## left son=266 (14 obs) right son=267 (49 obs)
## Primary splits:
## WordCount.root2 < 26.99984 to the left, improve=0.38322, (0 missing)
##
## Node number 140: 25 observations, complexity param=0.000772499
## predicted class=Y expected loss=0.4 P(node) =0.005203997
## class counts: 10 15
## probabilities: 0.400 0.600
## left son=280 (8 obs) right son=281 (17 obs)
## Primary splits:
## WordCount.root2 < 28.02674 to the left, improve=1.191176, (0 missing)
##
## Node number 141: 54 observations, complexity param=0.0003862495
## predicted class=Y expected loss=0.2777778 P(node) =0.01124063
## class counts: 15 39
## probabilities: 0.278 0.722
## left son=282 (45 obs) right son=283 (9 obs)
## Primary splits:
## WordCount.root2 < 26.55173 to the left, improve=0.6, (0 missing)
##
## Node number 148: 8 observations
## predicted class=N expected loss=0.25 P(node) =0.001665279
## class counts: 6 2
## probabilities: 0.750 0.250
##
## Node number 149: 28 observations, complexity param=0.001158749
## predicted class=N expected loss=0.4285714 P(node) =0.005828476
## class counts: 16 12
## probabilities: 0.571 0.429
## left son=298 (20 obs) right son=299 (8 obs)
## Primary splits:
## WordCount.root2 < 15.06648 to the left, improve=0.8642857, (0 missing)
##
## Node number 264: 627 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1036683 P(node) =0.1305162
## class counts: 562 65
## probabilities: 0.896 0.104
## left son=528 (561 obs) right son=529 (66 obs)
## Primary splits:
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve=2.8404170, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=1.0796950, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve=1.0670160, (0 missing)
## WordCount.root2 < 29.5127 to the left, improve=0.8966879, (0 missing)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.4399337, (0 missing)
##
## Node number 265: 303 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.1881188 P(node) =0.06307244
## class counts: 246 57
## probabilities: 0.812 0.188
## left son=530 (222 obs) right son=531 (81 obs)
## Primary splits:
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the left, improve=5.4890570, (0 missing)
## WordCount.root2 < 38.17067 to the right, improve=5.0156320, (0 missing)
## NDSSName.my.fctr#Opinion#RoomForDebate < 0.5 to the right, improve=3.4510070, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=1.5155860, (0 missing)
## NDSSName.my.fctr#U.S.#Education < 0.5 to the right, improve=0.8078801, (0 missing)
## Surrogate splits:
## WordCount.root2 < 34.08078 to the right, agree=0.739, adj=0.025, (0 split)
##
## Node number 266: 14 observations
## predicted class=N expected loss=0.2142857 P(node) =0.002914238
## class counts: 11 3
## probabilities: 0.786 0.214
##
## Node number 267: 49 observations, complexity param=0.0003862495
## predicted class=N expected loss=0.3469388 P(node) =0.01019983
## class counts: 32 17
## probabilities: 0.653 0.347
## left son=534 (10 obs) right son=535 (39 obs)
## Primary splits:
## WordCount.root2 < 41.56249 to the right, improve=0.5425432, (0 missing)
##
## Node number 280: 8 observations
## predicted class=N expected loss=0.375 P(node) =0.001665279
## class counts: 5 3
## probabilities: 0.625 0.375
##
## Node number 281: 17 observations
## predicted class=Y expected loss=0.2941176 P(node) =0.003538718
## class counts: 5 12
## probabilities: 0.294 0.706
##
## Node number 282: 45 observations, complexity param=0.0003862495
## predicted class=Y expected loss=0.3111111 P(node) =0.009367194
## class counts: 14 31
## probabilities: 0.311 0.689
## left son=564 (23 obs) right son=565 (22 obs)
## Primary splits:
## WordCount.root2 < 21.70252 to the right, improve=0.6050944, (0 missing)
##
## Node number 283: 9 observations
## predicted class=Y expected loss=0.1111111 P(node) =0.001873439
## class counts: 1 8
## probabilities: 0.111 0.889
##
## Node number 298: 20 observations
## predicted class=N expected loss=0.35 P(node) =0.004163197
## class counts: 13 7
## probabilities: 0.650 0.350
##
## Node number 299: 8 observations
## predicted class=Y expected loss=0.375 P(node) =0.001665279
## class counts: 3 5
## probabilities: 0.375 0.625
##
## Node number 528: 561 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.08734403 P(node) =0.1167777
## class counts: 512 49
## probabilities: 0.913 0.087
## left son=1056 (281 obs) right son=1057 (280 obs)
## Primary splits:
## WordCount.root2 < 29.33428 to the left, improve=1.5853030, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve=0.7645570, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.7250433, (0 missing)
## NDSSName.my.fctrStyles##Fashion < 0.5 to the right, improve=0.3000638, (0 missing)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.2729836, (0 missing)
## Surrogate splits:
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the left, agree=0.560, adj=0.118, (0 split)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, agree=0.533, adj=0.064, (0 split)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, agree=0.524, adj=0.046, (0 split)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the left, agree=0.515, adj=0.029, (0 split)
## NDSSName.my.fctrStyles##Fashion < 0.5 to the right, agree=0.512, adj=0.021, (0 split)
##
## Node number 529: 66 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.2424242 P(node) =0.01373855
## class counts: 50 16
## probabilities: 0.758 0.242
## left son=1058 (38 obs) right son=1059 (28 obs)
## Primary splits:
## WordCount.root2 < 27.86575 to the left, improve=0.6070859, (0 missing)
##
## Node number 530: 222 observations
## predicted class=N expected loss=0.1306306 P(node) =0.04621149
## class counts: 193 29
## probabilities: 0.869 0.131
##
## Node number 531: 81 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.345679 P(node) =0.01686095
## class counts: 53 28
## probabilities: 0.654 0.346
## left son=1062 (15 obs) right son=1063 (66 obs)
## Primary splits:
## WordCount.root2 < 41.59766 to the right, improve=2.866218, (0 missing)
##
## Node number 534: 10 observations
## predicted class=N expected loss=0.2 P(node) =0.002081599
## class counts: 8 2
## probabilities: 0.800 0.200
##
## Node number 535: 39 observations, complexity param=0.0003862495
## predicted class=N expected loss=0.3846154 P(node) =0.008118235
## class counts: 24 15
## probabilities: 0.615 0.385
## left son=1070 (32 obs) right son=1071 (7 obs)
## Primary splits:
## WordCount.root2 < 34.23387 to the left, improve=0.595467, (0 missing)
##
## Node number 564: 23 observations, complexity param=0.0003862495
## predicted class=Y expected loss=0.3913043 P(node) =0.004787677
## class counts: 9 14
## probabilities: 0.391 0.609
## left son=1128 (7 obs) right son=1129 (16 obs)
## Primary splits:
## WordCount.root2 < 23.6326 to the left, improve=0.6529503, (0 missing)
##
## Node number 565: 22 observations
## predicted class=Y expected loss=0.2272727 P(node) =0.004579517
## class counts: 5 17
## probabilities: 0.227 0.773
##
## Node number 1056: 281 observations
## predicted class=N expected loss=0.04982206 P(node) =0.05849292
## class counts: 267 14
## probabilities: 0.950 0.050
##
## Node number 1057: 280 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.125 P(node) =0.05828476
## class counts: 245 35
## probabilities: 0.875 0.125
## left son=2114 (71 obs) right son=2115 (209 obs)
## Primary splits:
## WordCount.root2 < 32.57299 to the right, improve=0.8968765, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve=0.7830739, (0 missing)
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the right, improve=0.3683673, (0 missing)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.3578067, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.3021494, (0 missing)
##
## Node number 1058: 38 observations
## predicted class=N expected loss=0.1842105 P(node) =0.007910075
## class counts: 31 7
## probabilities: 0.816 0.184
##
## Node number 1059: 28 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.3214286 P(node) =0.005828476
## class counts: 19 9
## probabilities: 0.679 0.321
## left son=2118 (19 obs) right son=2119 (9 obs)
## Primary splits:
## WordCount.root2 < 28.6269 to the right, improve=1.454052, (0 missing)
##
## Node number 1062: 15 observations
## predicted class=N expected loss=0.06666667 P(node) =0.003122398
## class counts: 14 1
## probabilities: 0.933 0.067
##
## Node number 1063: 66 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.4090909 P(node) =0.01373855
## class counts: 39 27
## probabilities: 0.591 0.409
## left son=2126 (25 obs) right son=2127 (41 obs)
## Primary splits:
## WordCount.root2 < 35.6581 to the left, improve=1.341286, (0 missing)
##
## Node number 1070: 32 observations
## predicted class=N expected loss=0.34375 P(node) =0.006661116
## class counts: 21 11
## probabilities: 0.656 0.344
##
## Node number 1071: 7 observations
## predicted class=Y expected loss=0.4285714 P(node) =0.001457119
## class counts: 3 4
## probabilities: 0.429 0.571
##
## Node number 1128: 7 observations
## predicted class=N expected loss=0.4285714 P(node) =0.001457119
## class counts: 4 3
## probabilities: 0.571 0.429
##
## Node number 1129: 16 observations
## predicted class=Y expected loss=0.3125 P(node) =0.003330558
## class counts: 5 11
## probabilities: 0.312 0.688
##
## Node number 2114: 71 observations
## predicted class=N expected loss=0.05633803 P(node) =0.01477935
## class counts: 67 4
## probabilities: 0.944 0.056
##
## Node number 2115: 209 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1483254 P(node) =0.04350541
## class counts: 178 31
## probabilities: 0.852 0.148
## left son=4230 (12 obs) right son=4231 (197 obs)
## Primary splits:
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve=0.5601729, (0 missing)
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.5108985, (0 missing)
## WordCount.root2 < 30.09153 to the right, improve=0.4980706, (0 missing)
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the right, improve=0.4241343, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.3390226, (0 missing)
##
## Node number 2118: 19 observations
## predicted class=N expected loss=0.2105263 P(node) =0.003955037
## class counts: 15 4
## probabilities: 0.789 0.211
##
## Node number 2119: 9 observations
## predicted class=Y expected loss=0.4444444 P(node) =0.001873439
## class counts: 4 5
## probabilities: 0.444 0.556
##
## Node number 2126: 25 observations
## predicted class=N expected loss=0.28 P(node) =0.005203997
## class counts: 18 7
## probabilities: 0.720 0.280
##
## Node number 2127: 41 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.4878049 P(node) =0.008534555
## class counts: 21 20
## probabilities: 0.512 0.488
## left son=4254 (30 obs) right son=4255 (11 obs)
## Primary splits:
## WordCount.root2 < 36.31791 to the right, improve=0.6635625, (0 missing)
##
## Node number 4230: 12 observations
## predicted class=N expected loss=0 P(node) =0.002497918
## class counts: 12 0
## probabilities: 1.000 0.000
##
## Node number 4231: 197 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1573604 P(node) =0.04100749
## class counts: 166 31
## probabilities: 0.843 0.157
## left son=8462 (11 obs) right son=8463 (186 obs)
## Primary splits:
## NDSSName.my.fctr#Multimedia# < 0.5 to the right, improve=0.5769882, (0 missing)
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the right, improve=0.5314217, (0 missing)
## WordCount.root2 < 30.09153 to the right, improve=0.4682049, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.4106319, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the right, improve=0.1814254, (0 missing)
##
## Node number 4254: 30 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.4333333 P(node) =0.006244796
## class counts: 17 13
## probabilities: 0.567 0.433
## left son=8508 (7 obs) right son=8509 (23 obs)
## Primary splits:
## WordCount.root2 < 37.14159 to the left, improve=0.3979296, (0 missing)
##
## Node number 4255: 11 observations
## predicted class=Y expected loss=0.3636364 P(node) =0.002289759
## class counts: 4 7
## probabilities: 0.364 0.636
##
## Node number 8462: 11 observations
## predicted class=N expected loss=0 P(node) =0.002289759
## class counts: 11 0
## probabilities: 1.000 0.000
##
## Node number 8463: 186 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1666667 P(node) =0.03871774
## class counts: 155 31
## probabilities: 0.833 0.167
## left son=16926 (29 obs) right son=16927 (157 obs)
## Primary splits:
## NDSSName.my.fctrMetro#N.Y./Region# < 0.5 to the right, improve=0.6559045, (0 missing)
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.4920635, (0 missing)
## WordCount.root2 < 30.09153 to the right, improve=0.3890196, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the right, improve=0.2415584, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the left, improve=0.0126479, (0 missing)
##
## Node number 8508: 7 observations
## predicted class=N expected loss=0.2857143 P(node) =0.001457119
## class counts: 5 2
## probabilities: 0.714 0.286
##
## Node number 8509: 23 observations, complexity param=0.0005793743
## predicted class=N expected loss=0.4782609 P(node) =0.004787677
## class counts: 12 11
## probabilities: 0.522 0.478
## left son=17018 (8 obs) right son=17019 (15 obs)
## Primary splits:
## WordCount.root2 < 38.57459 to the right, improve=0.2615942, (0 missing)
##
## Node number 16926: 29 observations
## predicted class=N expected loss=0.06896552 P(node) =0.006036636
## class counts: 27 2
## probabilities: 0.931 0.069
##
## Node number 16927: 157 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.1847134 P(node) =0.0326811
## class counts: 128 29
## probabilities: 0.815 0.185
## left son=33854 (18 obs) right son=33855 (139 obs)
## Primary splits:
## NDSSName.my.fctrForeign#World#AsiaPacific < 0.5 to the right, improve=0.67831090, (0 missing)
## WordCount.root2 < 32.38827 to the left, improve=0.61044970, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the right, improve=0.38816480, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the right, improve=0.01539613, (0 missing)
## Surrogate splits:
## WordCount.root2 < 32.51922 to the right, agree=0.892, adj=0.056, (0 split)
##
## Node number 17018: 8 observations
## predicted class=N expected loss=0.375 P(node) =0.001665279
## class counts: 5 3
## probabilities: 0.625 0.375
##
## Node number 17019: 15 observations
## predicted class=Y expected loss=0.4666667 P(node) =0.003122398
## class counts: 7 8
## probabilities: 0.467 0.533
##
## Node number 33854: 18 observations
## predicted class=N expected loss=0.05555556 P(node) =0.003746878
## class counts: 17 1
## probabilities: 0.944 0.056
##
## Node number 33855: 139 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.2014388 P(node) =0.02893422
## class counts: 111 28
## probabilities: 0.799 0.201
## left son=67710 (102 obs) right son=67711 (37 obs)
## Primary splits:
## WordCount.root2 < 30.09153 to the right, improve=0.9266317, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness < 0.5 to the right, improve=0.5580040, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the right, improve=0.1306354, (0 missing)
##
## Node number 67710: 102 observations
## predicted class=N expected loss=0.1666667 P(node) =0.02123231
## class counts: 85 17
## probabilities: 0.833 0.167
##
## Node number 67711: 37 observations, complexity param=0.0004213631
## predicted class=N expected loss=0.2972973 P(node) =0.007701915
## class counts: 26 11
## probabilities: 0.703 0.297
## left son=135422 (30 obs) right son=135423 (7 obs)
## Primary splits:
## WordCount.root2 < 29.92488 to the left, improve=3.00231700, (0 missing)
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook < 0.5 to the left, improve=0.01303089, (0 missing)
##
## Node number 135422: 30 observations
## predicted class=N expected loss=0.2 P(node) =0.006244796
## class counts: 24 6
## probabilities: 0.800 0.200
##
## Node number 135423: 7 observations
## predicted class=Y expected loss=0.2857143 P(node) =0.001457119
## class counts: 2 5
## probabilities: 0.286 0.714
##
## n= 4804
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4804 863 N (0.82035803 0.17964197)
## 2) NDSSName.my.fctrOpEd#Opinion#< 0.5 4367 485 N (0.88893978 0.11106022)
## 4) NDSSName.my.fctrBusiness#Crosswords/Games#< 0.5 4262 390 N (0.90849366 0.09150634)
## 8) NDSSName.my.fctrScience#Health#< 0.5 4114 279 N (0.93218279 0.06781721)
## 16) NDSSName.my.fctrStyles#U.S.#< 0.5 3987 191 N (0.95209431 0.04790569)
## 32) WordCount.root2< 25.01 2982 38 N (0.98725687 0.01274313) *
## 33) WordCount.root2>=25.01 1005 153 N (0.84776119 0.15223881)
## 66) NDSSName.my.fctr#Opinion#ThePublicEditor< 0.5 993 142 N (0.85699899 0.14300101)
## 132) NDSSName.my.fctrCulture#Arts#< 0.5 930 122 N (0.86881720 0.13118280)
## 264) WordCount.root2< 33.97057 627 65 N (0.89633174 0.10366826)
## 528) NDSSName.my.fctrBusiness#Technology#< 0.5 561 49 N (0.91265597 0.08734403)
## 1056) WordCount.root2< 29.33428 281 14 N (0.95017794 0.04982206) *
## 1057) WordCount.root2>=29.33428 280 35 N (0.87500000 0.12500000)
## 2114) WordCount.root2>=32.57299 71 4 N (0.94366197 0.05633803) *
## 2115) WordCount.root2< 32.57299 209 31 N (0.85167464 0.14832536)
## 4230) NDSSName.my.fctrTStyle##>=0.5 12 0 N (1.00000000 0.00000000) *
## 4231) NDSSName.my.fctrTStyle##< 0.5 197 31 N (0.84263959 0.15736041)
## 8462) NDSSName.my.fctr#Multimedia#>=0.5 11 0 N (1.00000000 0.00000000) *
## 8463) NDSSName.my.fctr#Multimedia#< 0.5 186 31 N (0.83333333 0.16666667)
## 16926) NDSSName.my.fctrMetro#N.Y./Region#>=0.5 29 2 N (0.93103448 0.06896552) *
## 16927) NDSSName.my.fctrMetro#N.Y./Region#< 0.5 157 29 N (0.81528662 0.18471338)
## 33854) NDSSName.my.fctrForeign#World#AsiaPacific>=0.5 18 1 N (0.94444444 0.05555556) *
## 33855) NDSSName.my.fctrForeign#World#AsiaPacific< 0.5 139 28 N (0.79856115 0.20143885)
## 67710) WordCount.root2>=30.09153 102 17 N (0.83333333 0.16666667) *
## 67711) WordCount.root2< 30.09153 37 11 N (0.70270270 0.29729730)
## 135422) WordCount.root2< 29.92488 30 6 N (0.80000000 0.20000000) *
## 135423) WordCount.root2>=29.92488 7 2 Y (0.28571429 0.71428571) *
## 529) NDSSName.my.fctrBusiness#Technology#>=0.5 66 16 N (0.75757576 0.24242424)
## 1058) WordCount.root2< 27.86575 38 7 N (0.81578947 0.18421053) *
## 1059) WordCount.root2>=27.86575 28 9 N (0.67857143 0.32142857)
## 2118) WordCount.root2>=28.6269 19 4 N (0.78947368 0.21052632) *
## 2119) WordCount.root2< 28.6269 9 4 Y (0.44444444 0.55555556) *
## 265) WordCount.root2>=33.97057 303 57 N (0.81188119 0.18811881)
## 530) NDSSName.my.fctrBusiness#BusinessDay#Dealbook< 0.5 222 29 N (0.86936937 0.13063063) *
## 531) NDSSName.my.fctrBusiness#BusinessDay#Dealbook>=0.5 81 28 N (0.65432099 0.34567901)
## 1062) WordCount.root2>=41.59766 15 1 N (0.93333333 0.06666667) *
## 1063) WordCount.root2< 41.59766 66 27 N (0.59090909 0.40909091)
## 2126) WordCount.root2< 35.6581 25 7 N (0.72000000 0.28000000) *
## 2127) WordCount.root2>=35.6581 41 20 N (0.51219512 0.48780488)
## 4254) WordCount.root2>=36.31791 30 13 N (0.56666667 0.43333333)
## 8508) WordCount.root2< 37.14159 7 2 N (0.71428571 0.28571429) *
## 8509) WordCount.root2>=37.14159 23 11 N (0.52173913 0.47826087)
## 17018) WordCount.root2>=38.57459 8 3 N (0.62500000 0.37500000) *
## 17019) WordCount.root2< 38.57459 15 7 Y (0.46666667 0.53333333) *
## 4255) WordCount.root2< 36.31791 11 4 Y (0.36363636 0.63636364) *
## 133) NDSSName.my.fctrCulture#Arts#>=0.5 63 20 N (0.68253968 0.31746032)
## 266) WordCount.root2< 26.99984 14 3 N (0.78571429 0.21428571) *
## 267) WordCount.root2>=26.99984 49 17 N (0.65306122 0.34693878)
## 534) WordCount.root2>=41.56249 10 2 N (0.80000000 0.20000000) *
## 535) WordCount.root2< 41.56249 39 15 N (0.61538462 0.38461538)
## 1070) WordCount.root2< 34.23387 32 11 N (0.65625000 0.34375000) *
## 1071) WordCount.root2>=34.23387 7 3 Y (0.42857143 0.57142857) *
## 67) NDSSName.my.fctr#Opinion#ThePublicEditor>=0.5 12 1 Y (0.08333333 0.91666667) *
## 17) NDSSName.my.fctrStyles#U.S.#>=0.5 127 39 Y (0.30708661 0.69291339)
## 34) WordCount.root2< 15.32846 13 5 N (0.61538462 0.38461538) *
## 35) WordCount.root2>=15.32846 114 31 Y (0.27192982 0.72807018)
## 70) WordCount.root2< 29.21444 79 25 Y (0.31645570 0.68354430)
## 140) WordCount.root2>=27.36786 25 10 Y (0.40000000 0.60000000)
## 280) WordCount.root2< 28.02674 8 3 N (0.62500000 0.37500000) *
## 281) WordCount.root2>=28.02674 17 5 Y (0.29411765 0.70588235) *
## 141) WordCount.root2< 27.36786 54 15 Y (0.27777778 0.72222222)
## 282) WordCount.root2< 26.55173 45 14 Y (0.31111111 0.68888889)
## 564) WordCount.root2>=21.70252 23 9 Y (0.39130435 0.60869565)
## 1128) WordCount.root2< 23.6326 7 3 N (0.57142857 0.42857143) *
## 1129) WordCount.root2>=23.6326 16 5 Y (0.31250000 0.68750000) *
## 565) WordCount.root2< 21.70252 22 5 Y (0.22727273 0.77272727) *
## 283) WordCount.root2>=26.55173 9 1 Y (0.11111111 0.88888889) *
## 71) WordCount.root2>=29.21444 35 6 Y (0.17142857 0.82857143) *
## 9) NDSSName.my.fctrScience#Health#>=0.5 148 37 Y (0.25000000 0.75000000)
## 18) WordCount.root2< 22.72663 55 23 N (0.58181818 0.41818182)
## 36) WordCount.root2>=19.93708 9 2 N (0.77777778 0.22222222) *
## 37) WordCount.root2< 19.93708 46 21 N (0.54347826 0.45652174)
## 74) WordCount.root2< 17.01454 36 14 N (0.61111111 0.38888889)
## 148) WordCount.root2>=15.74773 8 2 N (0.75000000 0.25000000) *
## 149) WordCount.root2< 15.74773 28 12 N (0.57142857 0.42857143)
## 298) WordCount.root2< 15.06648 20 7 N (0.65000000 0.35000000) *
## 299) WordCount.root2>=15.06648 8 3 Y (0.37500000 0.62500000) *
## 75) WordCount.root2>=17.01454 10 3 Y (0.30000000 0.70000000) *
## 19) WordCount.root2>=22.72663 93 5 Y (0.05376344 0.94623656) *
## 5) NDSSName.my.fctrBusiness#Crosswords/Games#>=0.5 105 10 Y (0.09523810 0.90476190)
## 10) WordCount.root2< 18.9043 12 5 N (0.58333333 0.41666667) *
## 11) WordCount.root2>=18.9043 93 3 Y (0.03225806 0.96774194) *
## 3) NDSSName.my.fctrOpEd#Opinion#>=0.5 437 59 Y (0.13501144 0.86498856) *
## Prediction
## Reference N Y
## N 3814 127
## Y 170 693
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.381765e-01 7.860827e-01 9.309917e-01 9.448229e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 2.798570e-127 1.480611e-02
## Prediction
## Reference N Y
## N 1180 318
## Y 84 146
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.673611e-01 2.953321e-01 7.467059e-01 7.871043e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 3.224022e-31
## id feats
## 1 Max.cor.Y.rcv.1X1.cp.0###rpart WordCount.root2,NDSSName.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 0.994 0.076
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8821543 0.9705658 0.7937428 0.9504198
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8235294 0.9381765
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9309917 0.9448229 0.7860827
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.6174697 0.9218959 0.3130435 0.7773858
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4207493 0.7673611
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7467059 0.7871043 0.2953321
#stop("here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
# if (glb_is_regression || glb_is_binomial) # For multinomials this model will be run next by default
# Fit the "Max.cor.Y" rpart model on the max-correlation feature set using
# repeated cross-validation; fold / repeat counts and the metric settings are
# taken from the glb* analysis-control globals. Parallel execution is
# explicitly switched off for this fit.
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(
        mdl_specs_lst = list(
            id.prefix                    = "Max.cor.Y",
            type                         = glb_model_type,
            trainControl.method          = "repeatedcv",
            trainControl.number          = glb_rcv_n_folds,
            trainControl.repeats         = glb_rcv_n_repeats,
            trainControl.classProbs      = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            trainControl.allowParallel   = FALSE,
            train.metric                 = glbMdlMetricSummary,
            train.maximize               = glbMdlMetricMaximize,
            train.method                 = "rpart"
        )
    ),
    indep_vars = max_cor_y_x_vars,
    rsp_var    = glb_rsp_var,
    fit_df     = glbObsFit,
    OOB_df     = glbObsOOB
)
## [1] "fitting model: Max.cor.Y##rcv#rpart"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr"
## + Fold1.Rep1: cp=0.01043
## - Fold1.Rep1: cp=0.01043
## + Fold2.Rep1: cp=0.01043
## - Fold2.Rep1: cp=0.01043
## + Fold3.Rep1: cp=0.01043
## - Fold3.Rep1: cp=0.01043
## + Fold1.Rep2: cp=0.01043
## - Fold1.Rep2: cp=0.01043
## + Fold2.Rep2: cp=0.01043
## - Fold2.Rep2: cp=0.01043
## + Fold3.Rep2: cp=0.01043
## - Fold3.Rep2: cp=0.01043
## + Fold1.Rep3: cp=0.01043
## - Fold1.Rep3: cp=0.01043
## + Fold2.Rep3: cp=0.01043
## - Fold2.Rep3: cp=0.01043
## + Fold3.Rep3: cp=0.01043
## - Fold3.Rep3: cp=0.01043
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0104 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y", : model's bestTune found at an extreme of
## tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 4804
##
## CP nsplit rel error
## 1 0.36964079 0 1.0000000
## 2 0.09849363 1 0.6303592
## 3 0.08574739 2 0.5318656
## 4 0.05677868 3 0.4461182
## 5 0.01042874 4 0.3893395
##
## Variable importance
## NDSSName.my.fctrOpEd#Opinion#
## 55
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 16
## NDSSName.my.fctrScience#Health#
## 16
## NDSSName.my.fctrStyles#U.S.#
## 12
##
## Node number 1: 4804 observations, complexity param=0.3696408
## predicted class=N expected loss=0.179642 P(node) =1
## class counts: 3941 863
## probabilities: 0.820 0.180
## left son=2 (4367 obs) right son=3 (437 obs)
## Primary splits:
## NDSSName.my.fctrOpEd#Opinion# < 0.5 to the left, improve=451.59770, (0 missing)
## NDSSName.my.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve=112.88510, (0 missing)
## WordCount.root2 < 25.75849 to the left, improve=111.17610, (0 missing)
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve= 99.35206, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 68.73272, (0 missing)
##
## Node number 2: 4367 observations, complexity param=0.09849363
## predicted class=N expected loss=0.1110602 P(node) =0.9090341
## class counts: 3882 485
## probabilities: 0.889 0.111
## left son=4 (4262 obs) right son=5 (105 obs)
## Primary splits:
## NDSSName.my.fctrBusiness#Crosswords/Games# < 0.5 to the left, improve=135.55130, (0 missing)
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve=125.07920, (0 missing)
## WordCount.root2 < 25.75849 to the left, improve= 94.70710, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 88.56821, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 18.74400, (0 missing)
##
## Node number 3: 437 observations
## predicted class=Y expected loss=0.1350114 P(node) =0.09096586
## class counts: 59 378
## probabilities: 0.135 0.865
##
## Node number 4: 4262 observations, complexity param=0.08574739
## predicted class=N expected loss=0.09150634 P(node) =0.8871774
## class counts: 3872 390
## probabilities: 0.908 0.092
## left son=8 (4114 obs) right son=9 (148 obs)
## Primary splits:
## NDSSName.my.fctrScience#Health# < 0.5 to the left, improve=132.96710, (0 missing)
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve= 94.69099, (0 missing)
## WordCount.root2 < 26.49528 to the left, improve= 84.07487, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 19.71762, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 10.17000, (0 missing)
##
## Node number 5: 105 observations
## predicted class=Y expected loss=0.0952381 P(node) =0.02185679
## class counts: 10 95
## probabilities: 0.095 0.905
##
## Node number 8: 4114 observations, complexity param=0.05677868
## predicted class=N expected loss=0.06781721 P(node) =0.8563697
## class counts: 3835 279
## probabilities: 0.932 0.068
## left son=16 (3987 obs) right son=17 (127 obs)
## Primary splits:
## NDSSName.my.fctrStyles#U.S.# < 0.5 to the left, improve=102.410700, (0 missing)
## WordCount.root2 < 25.01 to the left, improve= 47.352210, (0 missing)
## NDSSName.my.fctr#Opinion#ThePublicEditor < 0.5 to the left, improve= 20.930810, (0 missing)
## NDSSName.my.fctrTStyle## < 0.5 to the right, improve= 5.249425, (0 missing)
## NDSSName.my.fctrBusiness#Technology# < 0.5 to the left, improve= 2.395935, (0 missing)
##
## Node number 9: 148 observations
## predicted class=Y expected loss=0.25 P(node) =0.03080766
## class counts: 37 111
## probabilities: 0.250 0.750
##
## Node number 16: 3987 observations
## predicted class=N expected loss=0.04790569 P(node) =0.8299334
## class counts: 3796 191
## probabilities: 0.952 0.048
##
## Node number 17: 127 observations
## predicted class=Y expected loss=0.3070866 P(node) =0.0264363
## class counts: 39 88
## probabilities: 0.307 0.693
##
## n= 4804
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4804 863 N (0.82035803 0.17964197)
## 2) NDSSName.my.fctrOpEd#Opinion#< 0.5 4367 485 N (0.88893978 0.11106022)
## 4) NDSSName.my.fctrBusiness#Crosswords/Games#< 0.5 4262 390 N (0.90849366 0.09150634)
## 8) NDSSName.my.fctrScience#Health#< 0.5 4114 279 N (0.93218279 0.06781721)
## 16) NDSSName.my.fctrStyles#U.S.#< 0.5 3987 191 N (0.95209431 0.04790569) *
## 17) NDSSName.my.fctrStyles#U.S.#>=0.5 127 39 Y (0.30708661 0.69291339) *
## 9) NDSSName.my.fctrScience#Health#>=0.5 148 37 Y (0.25000000 0.75000000) *
## 5) NDSSName.my.fctrBusiness#Crosswords/Games#>=0.5 105 10 Y (0.09523810 0.90476190) *
## 3) NDSSName.my.fctrOpEd#Opinion#>=0.5 437 59 Y (0.13501144 0.86498856) *
## Prediction
## Reference N Y
## N 3796 145
## Y 191 672
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.300583e-01 7.576571e-01 9.224771e-01 9.371115e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 4.458834e-108 1.409037e-02
## Prediction
## Reference N Y
## N 1355 143
## Y 168 62
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.8200231 0.1825002 0.8010821 0.8378705 0.8668981
## AccuracyPValue McnemarPValue
## 1.0000000 0.1735405
## id feats max.nTuningRuns
## 1 Max.cor.Y##rcv#rpart WordCount.root2,NDSSName.my.fctr 5
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 3.519 0.074 0.8709432
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.9632073 0.778679 0.8746354 0.6
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.8 0.9296422 0.9224771
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.9371115 0.7515134 0.5870523 0.9045394
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.2695652 0.5892132 0.6 0.2850575
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.8200231 0.8010821 0.8378705
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.1825002 0.00506952 0.0191091
# Fit the "Max.cor.Y.Time.Poly" glmnet model: the max-correlation features plus
# every column of glbObsAll named <datetime feature>.day.minutes.poly.<...>.
# The block is skipped when no datetime features are configured or when no
# such columns exist.
#
# All datetime feature names are collapsed into ONE alternation pattern.
# grep()/grepl() only use the first element of a multi-element pattern vector
# (with a warning), so pasting the suffix onto each name separately would
# silently ignore every datetime feature after the first.
# NOTE(review): the names are used as regex fragments, exactly as before;
# confirm none of them contain regex metacharacters that need escaping.
timePolyVars <- character(0)
if (length(glbFeatsDateTime) > 0) {
    timePolyPattern <- paste0("(",
                              paste(names(glbFeatsDateTime), collapse = "|"),
                              ")\\.day\\.minutes\\.poly\\.")
    timePolyVars <- grep(timePolyPattern, names(glbObsAll), value = TRUE)
}
if (length(timePolyVars) > 0) {
    indepVars <- c(max_cor_y_x_vars, timePolyVars)
    indepVars <- myadjust_interaction_feats(indepVars)
    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Max.cor.Y.Time.Poly",
            type = glb_model_type, trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        indep_vars = indepVars,
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## [1] "fitting model: Max.cor.Y.Time.Poly##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.day.minutes.poly.2,PubDate.day.minutes.poly.3,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0201 on full training set
## Length Class Mode
## a0 100 -none- numeric
## beta 13100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 131 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.80640970
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.32799873
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.71895558
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.69928283
## NDSSName.my.fctrOpEd#Opinion#
## 4.00043819
## NDSSName.my.fctrScience#Health#
## 3.10750415
## NDSSName.my.fctrStyles#U.S.#
## 2.85717557
## NDSSName.my.fctrTStyle##
## -0.14606302
## PubDate.day.minutes.poly.1
## 5.57604425
## WordCount.root2
## 0.05218571
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 0.75372185
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.88967393
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.48865614
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.84279682
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.76936872
## NDSSName.my.fctrBusiness#Technology#
## 0.09968733
## NDSSName.my.fctrOpEd#Opinion#
## 4.06768787
## NDSSName.my.fctrScience#Health#
## 3.17511582
## NDSSName.my.fctrStyles#U.S.#
## 2.92206821
## NDSSName.my.fctrTStyle##
## -0.20202194
## PubDate.day.minutes.poly.1
## 7.01701251
## WordCount.root2
## 0.05458241
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 3.21496869
## Prediction
## Reference N Y
## N 3797 144
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.331807e-01 7.698585e-01 9.257484e-01 9.400811e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 3.361187e-115 7.408860e-02
## Prediction
## Reference N Y
## N 1200 298
## Y 90 140
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.754630e-01 2.963401e-01 7.550404e-01 7.949457e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 7.866103e-26
## id
## 1 Max.cor.Y.Time.Poly##rcv#glmnet
## feats
## 1 WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.day.minutes.poly.2,PubDate.day.minutes.poly.3,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 13.151 1.697
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8734975 0.9659985 0.7809965 0.9534659
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8103957 0.931932
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9257484 0.9400811 0.762829
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5950717 0.9118825 0.2782609 0.7997373
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4191617 0.775463
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7550404 0.7949457 0.2963401
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005321823 0.01901796
# Fit the "Max.cor.Y.Time.Lag" glmnet model: the max-correlation features plus
# every column of glbObsAll named <datetime feature>.last<digit>..., tuned
# over glmnet_tune_models_df. The block is skipped when no datetime features
# are configured or when no such columns exist.
#
# All datetime feature names are collapsed into ONE alternation pattern.
# grep()/grepl() only use the first element of a multi-element pattern vector
# (with a warning), so pasting the suffix onto each name separately would
# silently ignore every datetime feature after the first.
# NOTE(review): the names are used as regex fragments, exactly as before;
# confirm none of them contain regex metacharacters that need escaping.
timeLagVars <- character(0)
if (length(glbFeatsDateTime) > 0) {
    timeLagPattern <- paste0("(",
                             paste(names(glbFeatsDateTime), collapse = "|"),
                             ")\\.last[[:digit:]]")
    timeLagVars <- grep(timeLagPattern, names(glbObsAll), value = TRUE)
}
if (length(timeLagVars) > 0) {
    indepVars <- c(max_cor_y_x_vars, timeLagVars)
    indepVars <- myadjust_interaction_feats(indepVars)
    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Max.cor.Y.Time.Lag",
            type = glb_model_type,
            tune.df = glmnet_tune_models_df,
            trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        indep_vars = indepVars,
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## [1] "fitting model: Max.cor.Y.Time.Lag##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr,PubDate.last2.log1p,PubDate.last4.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.last32.log1p,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0934 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Lag", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Lag", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 100 -none- numeric
## beta 13100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 131 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.3346728432
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.0700552892
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 0.6960494506
## NDSSName.my.fctr#U.S.#Education
## -0.0690824805
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 0.6932362214
## NDSSName.my.fctrBusiness#Technology#
## 0.0213652189
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.0400198990
## NDSSName.my.fctrOpEd#Opinion#
## 0.7869504896
## NDSSName.my.fctrScience#Health#
## 0.8481886152
## NDSSName.my.fctrStyles##Fashion
## -0.0328362133
## NDSSName.my.fctrStyles#U.S.#
## 0.6561242230
## NDSSName.my.fctrTStyle##
## -0.0981051454
## PubDate.last2.log1p
## 0.0092548859
## PubDate.last4.log1p
## 0.0102871187
## PubDate.last8.log1p
## 0.0024984935
## WordCount.root2
## 0.0301342902
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last16.log1p.ctg
## -0.0035122093
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last16.log1p.ctg
## 0.0433276050
## NDSSName.my.fctr#U.S.#Education:PubDate.last16.log1p.ctg
## -0.0041927390
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last16.log1p.ctg
## 0.0484243489
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last16.log1p.ctg
## -0.0028368765
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last16.log1p.ctg
## 0.0461504248
## NDSSName.my.fctrScience#Health#:PubDate.last16.log1p.ctg
## 0.0309830777
## NDSSName.my.fctrStyles##Fashion:PubDate.last16.log1p.ctg
## -0.0006575310
## NDSSName.my.fctrStyles#U.S.#:PubDate.last16.log1p.ctg
## 0.0239742568
## NDSSName.my.fctrTStyle##:PubDate.last16.log1p.ctg
## -0.0065933198
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last2.log1p.ctg
## -0.0056903861
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last2.log1p.ctg
## 0.0314132550
## NDSSName.my.fctr#U.S.#Education:PubDate.last2.log1p.ctg
## -0.0062595339
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last2.log1p.ctg
## 0.0683177942
## NDSSName.my.fctrBusiness#Technology#:PubDate.last2.log1p.ctg
## 0.0089001176
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last2.log1p.ctg
## -0.0039023477
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last2.log1p.ctg
## 0.0677459270
## NDSSName.my.fctrScience#Health#:PubDate.last2.log1p.ctg
## 0.0530838737
## NDSSName.my.fctrStyles##Fashion:PubDate.last2.log1p.ctg
## -0.0015560741
## NDSSName.my.fctrStyles#U.S.#:PubDate.last2.log1p.ctg
## 0.0595010497
## NDSSName.my.fctrTStyle##:PubDate.last2.log1p.ctg
## -0.0094780945
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last32.log1p.ctg
## -0.0036496827
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last32.log1p.ctg
## 0.0438906916
## NDSSName.my.fctr#U.S.#Education:PubDate.last32.log1p.ctg
## -0.0028934662
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last32.log1p.ctg
## 0.0420188172
## NDSSName.my.fctrBusiness#Technology#:PubDate.last32.log1p.ctg
## 0.0001053778
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last32.log1p.ctg
## -0.0002655336
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last32.log1p.ctg
## 0.0399526463
## NDSSName.my.fctrScience#Health#:PubDate.last32.log1p.ctg
## 0.0309010857
## NDSSName.my.fctrStyles#U.S.#:PubDate.last32.log1p.ctg
## 0.0211726295
## NDSSName.my.fctrTStyle##:PubDate.last32.log1p.ctg
## -0.0055085967
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last4.log1p.ctg
## -0.0054198705
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last4.log1p.ctg
## 0.0332806264
## NDSSName.my.fctr#U.S.#Education:PubDate.last4.log1p.ctg
## -0.0058751936
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last4.log1p.ctg
## 0.0637336391
## NDSSName.my.fctrBusiness#Technology#:PubDate.last4.log1p.ctg
## 0.0052011593
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last4.log1p.ctg
## -0.0038587969
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last4.log1p.ctg
## 0.0596874074
## NDSSName.my.fctrScience#Health#:PubDate.last4.log1p.ctg
## 0.0396819084
## NDSSName.my.fctrStyles##Fashion:PubDate.last4.log1p.ctg
## -0.0013725654
## NDSSName.my.fctrStyles#U.S.#:PubDate.last4.log1p.ctg
## 0.0532860460
## NDSSName.my.fctrTStyle##:PubDate.last4.log1p.ctg
## -0.0082041791
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last8.log1p.ctg
## -0.0051487144
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last8.log1p.ctg
## 0.0400352897
## NDSSName.my.fctr#U.S.#Education:PubDate.last8.log1p.ctg
## -0.0050160685
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last8.log1p.ctg
## 0.0486360490
## NDSSName.my.fctrBusiness#Technology#:PubDate.last8.log1p.ctg
## 0.0032553238
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last8.log1p.ctg
## -0.0037797318
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last8.log1p.ctg
## 0.0530103663
## NDSSName.my.fctrScience#Health#:PubDate.last8.log1p.ctg
## 0.0378007768
## NDSSName.my.fctrStyles##Fashion:PubDate.last8.log1p.ctg
## -0.0013683257
## NDSSName.my.fctrStyles#U.S.#:PubDate.last8.log1p.ctg
## 0.0359367759
## NDSSName.my.fctrTStyle##:PubDate.last8.log1p.ctg
## -0.0076052662
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.4741443935
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.0913108705
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 0.7302582708
## NDSSName.my.fctr#U.S.#Education
## -0.0799432875
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 0.7098915737
## NDSSName.my.fctrBusiness#Technology#
## 0.0313863839
## NDSSName.my.fctrForeign#World#
## -0.0169095302
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.0511873031
## NDSSName.my.fctrOpEd#Opinion#
## 0.8108543356
## NDSSName.my.fctrScience#Health#
## 0.8885013044
## NDSSName.my.fctrStyles##Fashion
## -0.0470453548
## NDSSName.my.fctrStyles#U.S.#
## 0.6815476676
## NDSSName.my.fctrTStyle##
## -0.1056040103
## PubDate.last2.log1p
## 0.0119991350
## PubDate.last4.log1p
## 0.0133329085
## PubDate.last8.log1p
## 0.0064217053
## WordCount.root2
## 0.0320610100
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last16.log1p.ctg
## -0.0050988188
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last16.log1p.ctg
## 0.0448997743
## NDSSName.my.fctr#U.S.#Education:PubDate.last16.log1p.ctg
## -0.0050084105
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last16.log1p.ctg
## 0.0491525599
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last16.log1p.ctg
## -0.0037475441
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last16.log1p.ctg
## 0.0465297477
## NDSSName.my.fctrScience#Health#:PubDate.last16.log1p.ctg
## 0.0307757995
## NDSSName.my.fctrStyles##Fashion:PubDate.last16.log1p.ctg
## -0.0017151206
## NDSSName.my.fctrStyles#U.S.#:PubDate.last16.log1p.ctg
## 0.0234485995
## NDSSName.my.fctrTStyle##:PubDate.last16.log1p.ctg
## -0.0071212577
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last2.log1p.ctg
## -0.0077599233
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last2.log1p.ctg
## 0.0315315004
## NDSSName.my.fctr#U.S.#Education:PubDate.last2.log1p.ctg
## -0.0073500972
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last2.log1p.ctg
## 0.0699029286
## NDSSName.my.fctrBusiness#Technology#:PubDate.last2.log1p.ctg
## 0.0104997991
## NDSSName.my.fctrForeign#World#:PubDate.last2.log1p.ctg
## -0.0002079675
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last2.log1p.ctg
## -0.0050496337
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last2.log1p.ctg
## 0.0690992956
## NDSSName.my.fctrScience#Health#:PubDate.last2.log1p.ctg
## 0.0541552574
## NDSSName.my.fctrStyles##Fashion:PubDate.last2.log1p.ctg
## -0.0029060086
## NDSSName.my.fctrStyles#U.S.#:PubDate.last2.log1p.ctg
## 0.0614899139
## NDSSName.my.fctrTStyle##:PubDate.last2.log1p.ctg
## -0.0102372000
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last32.log1p.ctg
## -0.0051820958
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last32.log1p.ctg
## 0.0456360744
## NDSSName.my.fctr#U.S.#Education:PubDate.last32.log1p.ctg
## -0.0036536131
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last32.log1p.ctg
## 0.0424580432
## NDSSName.my.fctrBusiness#Technology#:PubDate.last32.log1p.ctg
## 0.0007660277
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last32.log1p.ctg
## -0.0010335475
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last32.log1p.ctg
## 0.0400154206
## NDSSName.my.fctrScience#Health#:PubDate.last32.log1p.ctg
## 0.0309238460
## NDSSName.my.fctrStyles##Fashion:PubDate.last32.log1p.ctg
## -0.0002014750
## NDSSName.my.fctrStyles#U.S.#:PubDate.last32.log1p.ctg
## 0.0205882818
## NDSSName.my.fctrTStyle##:PubDate.last32.log1p.ctg
## -0.0059487936
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last4.log1p.ctg
## -0.0073308225
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last4.log1p.ctg
## 0.0336573160
## NDSSName.my.fctr#U.S.#Education:PubDate.last4.log1p.ctg
## -0.0068682427
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last4.log1p.ctg
## 0.0652303875
## NDSSName.my.fctrBusiness#Technology#:PubDate.last4.log1p.ctg
## 0.0064111433
## NDSSName.my.fctrForeign#World#:PubDate.last4.log1p.ctg
## -0.0009097510
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last4.log1p.ctg
## -0.0049209363
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last4.log1p.ctg
## 0.0606305304
## NDSSName.my.fctrScience#Health#:PubDate.last4.log1p.ctg
## 0.0397307372
## NDSSName.my.fctrStyles##Fashion:PubDate.last4.log1p.ctg
## -0.0025762211
## NDSSName.my.fctrStyles#U.S.#:PubDate.last4.log1p.ctg
## 0.0548717296
## NDSSName.my.fctrTStyle##:PubDate.last4.log1p.ctg
## -0.0088553088
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last8.log1p.ctg
## -0.0069262052
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last8.log1p.ctg
## 0.0411687878
## NDSSName.my.fctr#U.S.#Education:PubDate.last8.log1p.ctg
## -0.0058924220
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last8.log1p.ctg
## 0.0491118878
## NDSSName.my.fctrBusiness#Technology#:PubDate.last8.log1p.ctg
## 0.0042430248
## NDSSName.my.fctrForeign#World#:PubDate.last8.log1p.ctg
## -0.0006436969
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last8.log1p.ctg
## -0.0047925039
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last8.log1p.ctg
## 0.0537292903
## NDSSName.my.fctrScience#Health#:PubDate.last8.log1p.ctg
## 0.0379358610
## NDSSName.my.fctrStyles##Fashion:PubDate.last8.log1p.ctg
## -0.0024919231
## NDSSName.my.fctrStyles#U.S.#:PubDate.last8.log1p.ctg
## 0.0362319863
## NDSSName.my.fctrTStyle##:PubDate.last8.log1p.ctg
## -0.0082143001
## Prediction
## Reference N Y
## N 3791 150
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.319317e-01 7.662012e-01 9.244394e-01 9.388938e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 2.593356e-112 1.504899e-01
## Prediction
## Reference N Y
## N 1219 279
## Y 98 132
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.818287e-01 2.908072e-01 7.615963e-01 8.010994e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.853246e-20
## id
## 1 Max.cor.Y.Time.Lag##rcv#glmnet
## feats
## 1 WordCount.root2,NDSSName.my.fctr,PubDate.last2.log1p,PubDate.last4.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.last32.log1p,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 42.176 2.825
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8621088 0.9652372 0.7589803 0.9558908
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.8075338 0.9279769
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9244394 0.9388938 0.7473218
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5927265 0.9158879 0.2695652 0.8024758
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4118565 0.7818287
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7615963 0.8010994 0.2908072
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.004678817 0.01649089
# Interactions.High.cor.Y
# Fits a repeated-CV glmnet model on max_cor_y_x_vars plus the interaction of
# its first element with each feature listed in glb_feats_df$cor.high.X
# (NA entries and features flagged `nzv` are dropped).
# The first max_cor_y_x_vars element is also dropped from int_feats: pairing a
# feature with itself only yields a redundant "x:x" term (the fit log shows no
# separate coefficient for it). Doing the exclusion inside the guard matters,
# because paste("x", character(0), sep = ":") would otherwise produce the
# malformed term "x:" when no other interaction feature remains.
if (length(int_feats <- setdiff(setdiff(unique(glb_feats_df$cor.high.X), NA),
                                c(subset(glb_feats_df, nzv)$id,
                                  max_cor_y_x_vars[1]))) > 0) {
    ret_lst <- myfit_mdl(
        mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Interact.High.cor.Y",
            type = glb_model_type,
            trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        indep_vars = c(max_cor_y_x_vars,
                       paste(max_cor_y_x_vars[1], int_feats, sep = ":")),
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## [1] "fitting model: Interact.High.cor.Y##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr,WordCount.root2:WordCount.root2,WordCount.root2:PubDate.day.minutes.poly.1,WordCount.root2:PubDate.last4.log1p,WordCount.root2:PubDate.month.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.000934 on full training set
## Length Class Mode
## a0 92 -none- numeric
## beta 2392 dgCMatrix S4
## df 92 -none- numeric
## dim 2 -none- numeric
## lambda 92 -none- numeric
## dev.ratio 92 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 26 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -4.8684452161
## NDSSName.my.fctr#Multimedia#
## -1.0487925341
## NDSSName.my.fctr#Opinion#RoomForDebate
## -5.4745572239
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 4.2885428864
## NDSSName.my.fctr#U.S.#Education
## -2.8325574053
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.3151450444
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.7572259115
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 4.3391439837
## NDSSName.my.fctrBusiness#Technology#
## 0.8045020529
## NDSSName.my.fctrCulture#Arts#
## -0.3229201625
## NDSSName.my.fctrForeign#World#
## -1.8392467451
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.8077498032
## NDSSName.my.fctrMetro#N.Y./Region#
## 0.2627263003
## NDSSName.my.fctrOpEd#Opinion#
## 4.7415088342
## NDSSName.my.fctrScience#Health#
## 3.7483695286
## NDSSName.my.fctrStyles##Fashion
## -2.4194040890
## NDSSName.my.fctrStyles#U.S.#
## 3.4229733393
## NDSSName.my.fctrTStyle##
## -2.0211781889
## NDSSName.my.fctrTravel#Travel#
## -1.7751305343
## NDSSName.my.fctrmyOther
## -2.3074809466
## WordCount.root2
## 0.0361357822
## WordCount.root2:PubDate.day.minutes.poly.1
## 1.0813556024
## WordCount.root2:PubDate.last4.log1p
## 0.0069832077
## WordCount.root2:PubDate.month.fctr10
## 0.0039570640
## WordCount.root2:PubDate.month.fctr11
## -0.0001256918
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -4.8709915569
## NDSSName.my.fctr#Multimedia#
## -1.0849702043
## NDSSName.my.fctr#Opinion#RoomForDebate
## -5.5929746399
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 4.2928665989
## NDSSName.my.fctr#U.S.#Education
## -2.9379189459
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.3276698231
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.7772032984
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 4.3366629069
## NDSSName.my.fctrBusiness#Technology#
## 0.8004536454
## NDSSName.my.fctrCulture#Arts#
## -0.3372244058
## NDSSName.my.fctrForeign#World#
## -1.9323067428
## NDSSName.my.fctrForeign#World#AsiaPacific
## -1.8410299268
## NDSSName.my.fctrMetro#N.Y./Region#
## 0.2641549470
## NDSSName.my.fctrOpEd#Opinion#
## 4.7402435031
## NDSSName.my.fctrScience#Health#
## 3.7456718341
## NDSSName.my.fctrStyles##Fashion
## -2.5160313263
## NDSSName.my.fctrStyles#U.S.#
## 3.4196115772
## NDSSName.my.fctrTStyle##
## -2.0543357159
## NDSSName.my.fctrTravel#Travel#
## -1.8687591112
## NDSSName.my.fctrmyOther
## -2.4107173389
## WordCount.root2
## 0.0361773869
## WordCount.root2:PubDate.day.minutes.poly.1
## 1.0887365081
## WordCount.root2:PubDate.last4.log1p
## 0.0070269546
## WordCount.root2:PubDate.month.fctr10
## 0.0039706413
## WordCount.root2:PubDate.month.fctr11
## -0.0002339144
## Prediction
## Reference N Y
## N 3787 154
## Y 173 690
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.319317e-01 7.670549e-01 9.244394e-01 9.388938e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 2.593356e-112 3.195407e-01
## Prediction
## Reference N Y
## N 1164 334
## Y 71 159
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.656250e-01 3.156027e-01 7.449213e-01 7.854227e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 9.555643e-39
## id
## 1 Interact.High.cor.Y##rcv#glmnet
## feats
## 1 WordCount.root2,NDSSName.my.fctr,WordCount.root2:WordCount.root2,WordCount.root2:PubDate.day.minutes.poly.1,WordCount.root2:PubDate.last4.log1p,WordCount.root2:PubDate.month.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 5.404 0.342
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8776419 0.9626998 0.792584 0.9625372
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8084359 0.931585
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9244394 0.9388938 0.764104
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.6009259 0.9105474 0.2913043 0.8140971
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.439834 0.765625
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7449213 0.7854227 0.3156027
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005250654 0.01810996
# Low.cor.X
# Fits a repeated-CV glmnet model on every feature in glb_feats_df that has no
# cor.high.X entry, is not flagged `nzv`, and is not excluded as a feature.
# Disabled alternative kept for reference (binomial classification only):
#   indep_vars_vctr <- subset(glb_feats_df, is.na(cor.high.X) &
#                                           is.ConditionalX.y &
#                                           (exclude.as.feat != 1))[, "id"]
indep_vars <- myadjust_interaction_feats(
    subset(glb_feats_df,
           is.na(cor.high.X) & !nzv & (exclude.as.feat != 1))[, "id"]
)
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(
        mdl_specs_lst = list(
            id.prefix = "Low.cor.X",
            type = glb_model_type,
            tune.df = glmnet_tune_models_df,
            trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet"
        )
    ),
    indep_vars = indep_vars,
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
)
## [1] "fitting model: Low.cor.X##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0934 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Low.cor.X", : model's bestTune found at an extreme of
## tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Low.cor.X", : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 100 -none- numeric
## beta 26100 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 261 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.202862e+00
## NDSSName.my.fctr#Opinion#RoomForDebate
## -8.521086e-02
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 6.880206e-01
## NDSSName.my.fctr#U.S.#Education
## -5.784059e-02
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 6.701988e-01
## NDSSName.my.fctrBusiness#Technology#
## 2.290481e-02
## NDSSName.my.fctrForeign#World#AsiaPacific
## -3.274668e-02
## NDSSName.my.fctrOpEd#Opinion#
## 7.842207e-01
## NDSSName.my.fctrScience#Health#
## 8.362226e-01
## NDSSName.my.fctrStyles##Fashion
## -3.079107e-02
## NDSSName.my.fctrStyles#U.S.#
## 6.586231e-01
## NDSSName.my.fctrTStyle##
## -9.956225e-02
## PubDate.day.minutes.poly.1
## 6.452334e+00
## PubDate.day.minutes.poly.2
## 3.185151e+00
## PubDate.day.minutes.poly.4
## 8.173930e-01
## PubDate.last4.log1p
## 4.098268e-03
## PubDate.wkday.fctr5
## -2.275236e-05
## PubDate.wkend
## 1.052728e-01
## WordCount.root2
## 2.981460e-02
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg
## 1.144109e+00
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -4.058241e+00
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg
## 1.435473e+00
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 4.534495e+00
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg
## 4.741264e-01
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg
## 1.162763e-01
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg
## 1.806309e+00
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last16.log1p.ctg
## -4.705401e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last16.log1p.ctg
## 4.304598e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last16.log1p.ctg
## -3.317348e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last16.log1p.ctg
## 4.698895e-02
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last16.log1p.ctg
## -2.205370e-03
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last16.log1p.ctg
## 4.593643e-02
## NDSSName.my.fctrScience#Health#:PubDate.last16.log1p.ctg
## 3.155326e-02
## NDSSName.my.fctrStyles##Fashion:PubDate.last16.log1p.ctg
## -4.428299e-04
## NDSSName.my.fctrStyles#U.S.#:PubDate.last16.log1p.ctg
## 2.398555e-02
## NDSSName.my.fctrTStyle##:PubDate.last16.log1p.ctg
## -6.710680e-03
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last2.log1p.ctg
## -7.172957e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last2.log1p.ctg
## 3.276090e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last2.log1p.ctg
## -5.154022e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last2.log1p.ctg
## 6.583764e-02
## NDSSName.my.fctrBusiness#Technology#:PubDate.last2.log1p.ctg
## 8.924455e-03
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last2.log1p.ctg
## -3.157626e-03
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last2.log1p.ctg
## 6.742134e-02
## NDSSName.my.fctrScience#Health#:PubDate.last2.log1p.ctg
## 5.375306e-02
## NDSSName.my.fctrStyles##Fashion:PubDate.last2.log1p.ctg
## -1.106633e-03
## NDSSName.my.fctrStyles#U.S.#:PubDate.last2.log1p.ctg
## 5.997036e-02
## NDSSName.my.fctrTStyle##:PubDate.last2.log1p.ctg
## -9.591957e-03
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last32.log1p.ctg
## -4.820085e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last32.log1p.ctg
## 4.357947e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last32.log1p.ctg
## -2.077560e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last32.log1p.ctg
## 4.049781e-02
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last32.log1p.ctg
## 3.962558e-02
## NDSSName.my.fctrScience#Health#:PubDate.last32.log1p.ctg
## 3.048215e-02
## NDSSName.my.fctrStyles#U.S.#:PubDate.last32.log1p.ctg
## 2.113852e-02
## NDSSName.my.fctrTStyle##:PubDate.last32.log1p.ctg
## -5.645530e-03
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last4.log1p.ctg
## -6.756004e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last4.log1p.ctg
## 3.427222e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last4.log1p.ctg
## -4.871129e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last4.log1p.ctg
## 6.144420e-02
## NDSSName.my.fctrBusiness#Technology#:PubDate.last4.log1p.ctg
## 5.221287e-03
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last4.log1p.ctg
## -3.128905e-03
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last4.log1p.ctg
## 5.939321e-02
## NDSSName.my.fctrScience#Health#:PubDate.last4.log1p.ctg
## 4.046602e-02
## NDSSName.my.fctrStyles##Fashion:PubDate.last4.log1p.ctg
## -9.613579e-04
## NDSSName.my.fctrStyles#U.S.#:PubDate.last4.log1p.ctg
## 5.354044e-02
## NDSSName.my.fctrTStyle##:PubDate.last4.log1p.ctg
## -8.318684e-03
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last8.log1p.ctg
## -6.377549e-03
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last8.log1p.ctg
## 4.012450e-02
## NDSSName.my.fctr#U.S.#Education:PubDate.last8.log1p.ctg
## -4.056166e-03
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last8.log1p.ctg
## 4.714690e-02
## NDSSName.my.fctrBusiness#Technology#:PubDate.last8.log1p.ctg
## 3.372173e-03
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last8.log1p.ctg
## -3.104981e-03
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last8.log1p.ctg
## 5.275033e-02
## NDSSName.my.fctrScience#Health#:PubDate.last8.log1p.ctg
## 3.856981e-02
## NDSSName.my.fctrStyles##Fashion:PubDate.last8.log1p.ctg
## -1.044166e-03
## NDSSName.my.fctrStyles#U.S.#:PubDate.last8.log1p.ctg
## 3.610233e-02
## NDSSName.my.fctrTStyle##:PubDate.last8.log1p.ctg
## -7.703998e-03
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.2917372206
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.1077991963
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 0.7145162057
## NDSSName.my.fctr#U.S.#Education
## -0.0675153576
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 0.6847086604
## NDSSName.my.fctrBusiness#Technology#
## 0.0338425465
## NDSSName.my.fctrForeign#World#
## -0.0056510270
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.0431336996
## NDSSName.my.fctrOpEd#Opinion#
## 0.8080156260
## NDSSName.my.fctrScience#Health#
## 0.8720969876
## NDSSName.my.fctrStyles##Fashion
## -0.0449467556
## NDSSName.my.fctrStyles#U.S.#
## 0.6848458465
## NDSSName.my.fctrTStyle##
## -0.1068542341
## PubDate.day.minutes.poly.1
## 6.9218320529
## PubDate.day.minutes.poly.2
## 3.7125649527
## PubDate.day.minutes.poly.4
## 1.0946423451
## PubDate.last4.log1p
## 0.0074154519
## PubDate.wkday.fctr5
## -0.0099640217
## PubDate.wkend
## 0.1191625256
## WordCount.root2
## 0.0317225870
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg
## 1.4076395302
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -5.5235755281
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg
## 1.6550615516
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 5.3180890769
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg
## 0.7422606550
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg
## 0.4015516661
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg
## 1.2519607267
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg
## 0.0151914873
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg
## 2.7388433211
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last16.log1p.ctg
## -0.0063971273
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last16.log1p.ctg
## 0.0444497010
## NDSSName.my.fctr#U.S.#Education:PubDate.last16.log1p.ctg
## -0.0040421728
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last16.log1p.ctg
## 0.0476287025
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last16.log1p.ctg
## -0.0030456278
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last16.log1p.ctg
## 0.0463442551
## NDSSName.my.fctrScience#Health#:PubDate.last16.log1p.ctg
## 0.0314477085
## NDSSName.my.fctrStyles##Fashion:PubDate.last16.log1p.ctg
## -0.0014930601
## NDSSName.my.fctrStyles#U.S.#:PubDate.last16.log1p.ctg
## 0.0234751275
## NDSSName.my.fctrTStyle##:PubDate.last16.log1p.ctg
## -0.0072231460
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last2.log1p.ctg
## -0.0093789969
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last2.log1p.ctg
## 0.0336400023
## NDSSName.my.fctr#U.S.#Education:PubDate.last2.log1p.ctg
## -0.0061272849
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last2.log1p.ctg
## 0.0671838538
## NDSSName.my.fctrBusiness#Technology#:PubDate.last2.log1p.ctg
## 0.0105210601
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last2.log1p.ctg
## -0.0042188810
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last2.log1p.ctg
## 0.0687814614
## NDSSName.my.fctrScience#Health#:PubDate.last2.log1p.ctg
## 0.0550257721
## NDSSName.my.fctrStyles##Fashion:PubDate.last2.log1p.ctg
## -0.0024258118
## NDSSName.my.fctrStyles#U.S.#:PubDate.last2.log1p.ctg
## 0.0620565904
## NDSSName.my.fctrTStyle##:PubDate.last2.log1p.ctg
## -0.0103290150
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last32.log1p.ctg
## -0.0064579863
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last32.log1p.ctg
## 0.0450789075
## NDSSName.my.fctr#U.S.#Education:PubDate.last32.log1p.ctg
## -0.0027461150
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last32.log1p.ctg
## 0.0408247292
## NDSSName.my.fctrBusiness#Technology#:PubDate.last32.log1p.ctg
## 0.0003621433
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last32.log1p.ctg
## -0.0005633660
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last32.log1p.ctg
## 0.0397403693
## NDSSName.my.fctrScience#Health#:PubDate.last32.log1p.ctg
## 0.0304028028
## NDSSName.my.fctrStyles##Fashion:PubDate.last32.log1p.ctg
## -0.0001455953
## NDSSName.my.fctrStyles#U.S.#:PubDate.last32.log1p.ctg
## 0.0205734403
## NDSSName.my.fctrTStyle##:PubDate.last32.log1p.ctg
## -0.0060725179
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last4.log1p.ctg
## -0.0087853392
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last4.log1p.ctg
## 0.0351829015
## NDSSName.my.fctr#U.S.#Education:PubDate.last4.log1p.ctg
## -0.0057617515
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last4.log1p.ctg
## 0.0627276287
## NDSSName.my.fctrBusiness#Technology#:PubDate.last4.log1p.ctg
## 0.0064552363
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last4.log1p.ctg
## -0.0041041644
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last4.log1p.ctg
## 0.0603096891
## NDSSName.my.fctrScience#Health#:PubDate.last4.log1p.ctg
## 0.0407244990
## NDSSName.my.fctrStyles##Fashion:PubDate.last4.log1p.ctg
## -0.0021050459
## NDSSName.my.fctrStyles#U.S.#:PubDate.last4.log1p.ctg
## 0.0551911420
## NDSSName.my.fctrTStyle##:PubDate.last4.log1p.ctg
## -0.0089469436
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.last8.log1p.ctg
## -0.0082606820
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.last8.log1p.ctg
## 0.0412828578
## NDSSName.my.fctr#U.S.#Education:PubDate.last8.log1p.ctg
## -0.0048320742
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.last8.log1p.ctg
## 0.0475545601
## NDSSName.my.fctrBusiness#Technology#:PubDate.last8.log1p.ctg
## 0.0044231483
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.last8.log1p.ctg
## -0.0040431528
## NDSSName.my.fctrOpEd#Opinion#:PubDate.last8.log1p.ctg
## 0.0534726687
## NDSSName.my.fctrScience#Health#:PubDate.last8.log1p.ctg
## 0.0389034575
## NDSSName.my.fctrStyles##Fashion:PubDate.last8.log1p.ctg
## -0.0021193073
## NDSSName.my.fctrStyles#U.S.#:PubDate.last8.log1p.ctg
## 0.0364380868
## NDSSName.my.fctrTStyle##:PubDate.last8.log1p.ctg
## -0.0082891522
## Prediction
## Reference N Y
## N 3787 154
## Y 174 689
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.317236e-01 7.662359e-01 9.242213e-01 9.386958e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 7.764099e-112 2.941323e-01
## Prediction
## Reference N Y
## N 1209 289
## Y 94 136
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.783565e-01 2.931798e-01 7.580195e-01 7.977437e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 3.657340e-23
## id
## 1 Low.cor.X##rcv#glmnet
## feats
## 1 WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 77.491 4.864
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8624894 0.9659985 0.7589803 0.958864
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.8077374 0.9276303
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9242213 0.9386958 0.7453708
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5917252 0.9138852 0.2695652 0.8052766
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.4152672 0.7783565
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7580195 0.7977437 0.2931798
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.004942454 0.01832711
# Discard the result of the last myfit_mdl() call and open the next minor step
# of the "fit.models" chunk in the run-time log.
rm(list = "ret_lst")
glb_chunks_df <- myadd_chunk(
    glb_chunks_df,
    "fit.models",
    major.inc = FALSE
)
## label step_major step_minor label_minor bgn end elapsed
## 10 fit.models 6 0 0 74.756 350.285 275.529
## 11 fit.models 6 1 1 350.285 NA NA
# Start a fresh timing log for the fit.models_1 stage.
fit.models_1_chunk_df <- myadd_chunk(
    NULL,
    "fit.models_1_bgn",
    label.minor = "setup"
)
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 setup 389.669 NA NA
#stop("here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
# Fit every (feature-set family, method) combination named in
# glb_mdl_family_lst. For each family prefix the loop:
#   1. resolves the independent variables (interaction-derived, user specified,
#      RFE-selected, or all non-nzv / non-excluded features, in that order),
#   2. picks the fit observations, dropping outliers registered for the prefix,
#   3. calls myfit_mdl() once per method using repeated cross-validation.
# topindep_var / interact_vars persist across iterations so that the
# ".Interact" feature selection below is computed only once.
topindep_var <- NULL; interact_vars <- NULL;
for (mdl_id_pfx in names(glb_mdl_family_lst)) {
    fit.models_1_chunk_df <-
        myadd_chunk(fit.models_1_chunk_df, paste0("fit.models_1_", mdl_id_pfx),
                    major.inc = FALSE, label.minor = "setup")

    indep_vars <- NULL;

    if (grepl("\\.Interact", mdl_id_pfx)) {
        if (is.null(topindep_var) && is.null(interact_vars)) {
            # select best glmnet model up to now
            dsp_models_df <- orderBy(model_sel_frmla <- get_model_sel_frmla(),
                                     glb_models_df)
            # Model ids printed by this run look like
            # "<prefix>#<preProc>#<resample>#<method>" (e.g. "All.X##rcv#glmnet");
            # the older "<prefix>.<method>" form is still accepted.
            dsp_models_df <- subset(dsp_models_df,
                                    grepl("[.#]glmnet", id))
            if (nrow(dsp_models_df) == 0) {
                warning("No glmnet model available as base for: ", mdl_id_pfx)
                next
            }
            bst_mdl_id <- dsp_models_df$id[1]
            # Derive the family prefix of the best model: first "#" field for
            # current ids, everything but the last "." field for legacy ids.
            bst_pfx_parts <-
                if (grepl("#", bst_mdl_id, fixed = TRUE))
                    unlist(strsplit(bst_mdl_id, "#", fixed = TRUE))[1] else
                    head(unlist(strsplit(bst_mdl_id, "[.]")), -1)
            mdl_id_pfx <- paste(c(bst_pfx_parts, "Interact"), collapse = ".")

            # select important features
            if (is.null(bst_featsimp_df <-
                        myget_feats_importance(glb_models_lst[[bst_mdl_id]]))) {
                warning("Base model for RFE.Interact: ", bst_mdl_id,
                        " has no important features")
                next
            }

            # Walk down the importance ranking until a feature that is not
            # interaction-only is found; dummy names are collapsed back to
            # their parent "<name>.fctr" feature.
            topindep_ix <- 1
            while (is.null(topindep_var) && (topindep_ix <= nrow(bst_featsimp_df))) {
                topindep_var <- row.names(bst_featsimp_df)[topindep_ix]
                if (grepl(".fctr", topindep_var, fixed = TRUE))
                    topindep_var <-
                        paste0(unlist(strsplit(topindep_var, ".fctr",
                                               fixed = TRUE))[1], ".fctr")
                if (topindep_var %in% names(glbFeatsInteractionOnly)) {
                    topindep_var <- NULL; topindep_ix <- topindep_ix + 1
                } else break
            }

            # select features with importance > max(10, importance of .rnorm) & is not highest
            # combine factor dummy features to just the factor feature
            if (length(pos_rnorm <-
                       grep(".rnorm", row.names(bst_featsimp_df), fixed = TRUE)) > 0)
                imp_rnorm <- bst_featsimp_df[pos_rnorm, 1] else
                imp_rnorm <- NA
            imp_cutoff <- max(10, imp_rnorm, na.rm = TRUE)
            interact_vars <-
                tail(row.names(subset(bst_featsimp_df,
                                      imp > imp_cutoff)), -1)
            if (length(interact_vars) > 0) {
                interact_vars <-
                    myadjust_interaction_feats(myextract_actual_feats(interact_vars))
                interact_vars <-
                    interact_vars[!grepl(topindep_var, interact_vars, fixed = TRUE)]
            }
            ### bid0_sp only
            #             interact_vars <- c(
            #     "biddable", "D.ratio.sum.TfIdf.wrds.n", "D.TfIdf.sum.stem.stop.Ratio", "D.sum.TfIdf",
            #     "D.TfIdf.sum.post.stop", "D.TfIdf.sum.post.stem", "D.ratio.wrds.stop.n.wrds.n", "D.chrs.uppr.n.log",
            #     "D.chrs.n.log", "color.fctr"
            #     # , "condition.fctr", "prdl.my.descr.fctr"
            #                                 )
            #            interact_vars <- setdiff(interact_vars, c("startprice.dgt2.is9", "color.fctr"))
            ###
            indep_vars <- myextract_actual_feats(row.names(bst_featsimp_df))
            indep_vars <- setdiff(indep_vars, topindep_var)
            if (length(interact_vars) > 0) {
                indep_vars <-
                    setdiff(indep_vars, myextract_actual_feats(interact_vars))
                indep_vars <- c(indep_vars,
                                paste(topindep_var, setdiff(interact_vars, topindep_var),
                                      sep = "*"))
            } else indep_vars <- union(indep_vars, topindep_var)
        }
    }

    # Fallbacks, in priority order, when no interaction features were derived.
    if (is.null(indep_vars))
        indep_vars <- glb_mdl_feats_lst[[mdl_id_pfx]]

    if (is.null(indep_vars) && grepl("RFE\\.", mdl_id_pfx))
        indep_vars <- myextract_actual_feats(predictors(rfe_fit_results))

    if (is.null(indep_vars))
        indep_vars <- subset(glb_feats_df, !nzv & (exclude.as.feat != 1))[, "id"]

    # A feature spec starting with "%<d-%" holds an R expression to evaluate
    # lazily. Such a spec is a single string, so only the first element is
    # tested: passing the whole vector to if() warns (first element silently
    # used) on older R and is an error from R 4.2.0 onwards.
    if (isTRUE(grepl("^%<d-%", indep_vars[1]))) {
        #stop("here")
        # NOTE(review): eval(parse()) runs arbitrary text; the spec is expected
        # to come only from this project's own configuration.
        indep_vars <-
            eval(parse(text = str_trim(unlist(strsplit(indep_vars[1], "%<d-%"))[2])))
    }

    indep_vars <- myadjust_interaction_feats(indep_vars)

    if (grepl("\\.Interact", mdl_id_pfx)) {
        # if (method != tail(unlist(strsplit(bst_mdl_id, "[.]")), 1)) next
        # Derived "<prefix>.Interact" families inherit the methods configured
        # for "Best.Interact" unless they have their own entry.
        if (is.null(glb_mdl_family_lst[[mdl_id_pfx]])) {
            if (!is.null(glb_mdl_family_lst[["Best.Interact"]]))
                glb_mdl_family_lst[[mdl_id_pfx]] <-
                    glb_mdl_family_lst[["Best.Interact"]]
        }
    }

    if (!is.null(glbObsFitOutliers[[mdl_id_pfx]])) {
        fitobs_df <- glbObsFit[!(glbObsFit[, glb_id_var] %in%
                                     glbObsFitOutliers[[mdl_id_pfx]]), ]
    } else fitobs_df <- glbObsFit

    if (is.null(glb_mdl_family_lst[[mdl_id_pfx]]))
        mdl_methods <- glbMdlMethods else
        mdl_methods <- glb_mdl_family_lst[[mdl_id_pfx]]

    # The last([[:digit:]]+)(.*)\\.ctg feats are taking a long time for this experiment
    # (method-independent, so filtered once before the method loop)
    indep_vars <- indep_vars[!grepl("\\.last([[:digit:]]+)(.*)\\.ctg", indep_vars)]

    for (method in mdl_methods) {
        # Use a per-method copy so that dropping .rnorm for tree methods does
        # not leak into the methods fitted after them for the same family.
        mdl_indep_vars <- indep_vars
        if (method %in% c("rpart", "rf")) {
            # rpart: fubar's the tree
            # rf: skip the scenario w/ .rnorm for speed
            mdl_indep_vars <- setdiff(mdl_indep_vars, c(".rnorm"))
            #mdl_id <- paste0(mdl_id_pfx, ".no.rnorm")
        }

        fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df,
                            paste0("fit.models_1_", mdl_id_pfx), major.inc = FALSE,
                                             label.minor = method)

        ret_lst <-
            myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
                id.prefix = mdl_id_pfx,
                type = glb_model_type,
                tune.df =
                    if ((mdl_id_pfx %in% "All.X") && (method %in% "glmnet")) glmnet_tune_models_df else
                        glb_tune_models_df,
                trainControl.method = "repeatedcv",
                trainControl.number = glb_rcv_n_folds,
                trainControl.repeats = glb_rcv_n_repeats,
                trainControl.classProbs = glb_is_classification,
                trainControl.summaryFunction = glbMdlMetricSummaryFn,
                #trainControl.allowParallel = FALSE,
                train.metric = glbMdlMetricSummary,
                train.maximize = glbMdlMetricMaximize,
                train.method = method)),
                indep_vars = mdl_indep_vars, rsp_var = glb_rsp_var,
                fit_df = fitobs_df, OOB_df = glbObsOOB)
    }
}
## label step_major step_minor label_minor bgn end
## 1 fit.models_1_bgn 1 0 setup 389.669 389.679
## 2 fit.models_1_All.X 1 1 setup 389.680 NA
## elapsed
## 1 0.01
## 2 NA
## Warning in if (grepl("^%<d-%", indep_vars)) {: the condition has length > 1
## and only the first element will be used
## label step_major step_minor label_minor bgn end
## 2 fit.models_1_All.X 1 1 setup 389.680 389.689
## 3 fit.models_1_All.X 1 2 glmnet 389.689 NA
## elapsed
## 2 0.009
## 3 NA
## [1] "fitting model: All.X##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0934 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 100 -none- numeric
## beta 16200 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 162 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.70728068
## NDSSName.my.fctr#Multimedia#
## -0.03057558
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.60940339
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 1.94290987
## NDSSName.my.fctr#U.S.#Education
## -0.26695651
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.16586946
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.12932205
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 2.23109377
## NDSSName.my.fctrCulture#Arts#
## -0.17319452
## NDSSName.my.fctrForeign#World#
## -0.14135573
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.30346668
## NDSSName.my.fctrOpEd#Opinion#
## 2.48538732
## NDSSName.my.fctrScience#Health#
## 1.99127252
## NDSSName.my.fctrStyles##Fashion
## -0.25250399
## NDSSName.my.fctrStyles#U.S.#
## 1.82412410
## NDSSName.my.fctrTStyle##
## -0.42090722
## NDSSName.my.fctrTravel#Travel#
## -0.11316375
## PubDate.day.minutes.poly.1
## 9.79096645
## PubDate.day.minutes.poly.2
## 1.79653308
## PubDate.day.minutes.poly.4
## 3.98793812
## PubDate.hour.fctr(15.3,23]
## 0.03978968
## PubDate.last2.log1p
## 0.01083809
## PubDate.last4.log1p
## 0.01693989
## PubDate.wkend
## 0.15569751
## WordCount.log1p
## 0.14959363
## WordCount.root2
## 0.02370264
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg
## -0.59811229
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg
## 0.46880580
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -3.15153855
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 4.31724063
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg
## 0.70315244
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg
## 0.44783900
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg
## 1.13558865
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.832271960
## NDSSName.my.fctr#Multimedia#
## -0.057362968
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.690208382
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.048782794
## NDSSName.my.fctr#U.S.#Education
## -0.296522166
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.177201520
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.157735256
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 2.308574637
## NDSSName.my.fctrCulture#Arts#
## -0.187159278
## NDSSName.my.fctrForeign#World#
## -0.169354647
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.340667898
## NDSSName.my.fctrOpEd#Opinion#
## 2.572424625
## NDSSName.my.fctrScience#Health#
## 2.067845584
## NDSSName.my.fctrStyles##Fashion
## -0.291175749
## NDSSName.my.fctrStyles#U.S.#
## 1.900531124
## NDSSName.my.fctrTStyle##
## -0.447496009
## NDSSName.my.fctrTravel#Travel#
## -0.145402163
## NDSSName.my.fctrmyOther
## -0.020947092
## PubDate.day.minutes.poly.1
## 10.186502096
## PubDate.day.minutes.poly.2
## 2.103810556
## PubDate.day.minutes.poly.4
## 4.337265373
## PubDate.hour.fctr(15.3,23]
## 0.041843553
## PubDate.last2.log1p
## 0.012487863
## PubDate.last4.log1p
## 0.018581475
## PubDate.last8.log1p
## 0.001363957
## PubDate.wkend
## 0.166423165
## WordCount.log1p
## 0.156206169
## WordCount.root2
## 0.024718690
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg
## -0.765331026
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg
## 0.719908949
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -4.452382355
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg
## 0.043369062
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 5.091693780
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg
## 0.851577628
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg
## 0.693318411
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg
## 1.932426590
## Prediction
## Reference N Y
## N 3790 151
## Y 177 686
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.317236e-01 7.655936e-01 9.242213e-01 9.386958e-01 8.203580e-01
## AccuracyPValue McnemarPValue
## 7.764099e-112 1.674653e-01
## Prediction
## Reference N Y
## N 874 624
## Y 25 205
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.244213e-01 2.258300e-01 6.011030e-01 6.473162e-01 8.668981e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 7.599305e-122
## id
## 1 All.X##rcv#glmnet
## feats
## 1 WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 17.518 1.574
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8371661 0.9779244 0.6964079 0.9604952
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.8070588 0.9233293
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9242213 0.9386958 0.7141966
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5873513 0.9399199 0.2347826 0.8129999
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.3871577 0.6244213
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.601103 0.6473162 0.22583
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.006254044 0.02777844
# Open the timing-log step for the preProcess comparison: the currently
# selected model is refit with alternative preProcess methods to see whether
# any of them improves performance.
fit.models_1_chunk_df <- myadd_chunk(
    fit.models_1_chunk_df,
    "fit.models_1_preProc",
    major.inc = FALSE,
    label.minor = "preProc"
)
## label step_major step_minor label_minor bgn end
## 3 fit.models_1_All.X 1 2 glmnet 389.689 416.481
## 4 fit.models_1_preProc 1 3 preProc 416.482 NA
## elapsed
## 3 26.792
## 4 NA
# Refit the currently best-ranked model once per entry of glb_preproc_methods,
# keeping its feature list, method and resampling setup, so that the effect of
# each preProcess option can be compared in glb_models_df.
mdl_id <- orderBy(get_model_sel_frmla(), glb_models_df)[1, "id"]
indep_vars_vctr <- trim(unlist(strsplit(glb_models_df[glb_models_df$id == mdl_id,
                                                      "feats"], "[,]")))

# Recover the training method and family prefix from the model id.
# Ids printed by this run have the form "<prefix>#<preProc>#<resample>#<method>"
# (e.g. "All.X##rcv#glmnet"); splitting those on "." would yield
# method = "X##rcv#glmnet" and prefix = "All". The older dot-delimited
# "<prefix>.<method>" form is still handled as a fallback.
if (grepl("#", mdl_id, fixed = TRUE)) {
    mdl_id_parts <- unlist(strsplit(mdl_id, "#", fixed = TRUE))
    method <- tail(mdl_id_parts, 1)
    mdl_id_pfx <- mdl_id_parts[1]
} else {
    mdl_id_parts <- unlist(strsplit(mdl_id, "[.]"))
    method <- tail(mdl_id_parts, 1)
    mdl_id_pfx <- paste0(head(mdl_id_parts, -1), collapse = ".")
}

# Drop any outlier observations registered for this model family.
if (!is.null(glbObsFitOutliers[[mdl_id_pfx]])) {
    fitobs_df <- glbObsFit[!(glbObsFit[, glb_id_var] %in%
                                 glbObsFitOutliers[[mdl_id_pfx]]), ]
} else fitobs_df <- glbObsFit

for (prePr in glb_preproc_methods) {
    # The operations are applied in this order:
    # Box-Cox/Yeo-Johnson transformation, centering, scaling, range, imputation, PCA, ICA then spatial sign.
    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
        id.prefix = mdl_id_pfx,
        type = glb_model_type, tune.df = glb_tune_models_df,
        trainControl.method = "repeatedcv",
        trainControl.number = glb_rcv_n_folds,
        trainControl.repeats = glb_rcv_n_repeats,
        trainControl.classProbs = glb_is_classification,
        trainControl.summaryFunction = glbMdlMetricSummaryFn,
        train.metric = glbMdlMetricSummary,
        train.maximize = glbMdlMetricMaximize,
        train.method = method, train.preProcess = prePr)),
        indep_vars = indep_vars_vctr, rsp_var = glb_rsp_var,
        fit_df = fitobs_df, OOB_df = glbObsOOB)
}
# If (All|RFE).X.glm is less accurate than Low.Cor.X.glm
# check NA coefficients & filter appropriate terms in indep_vars_vctr
# if (method == "glm") {
# orig_glm <- glb_models_lst[[paste0(mdl_id, ".", model_method)]]$finalModel
# orig_glm <- glb_models_lst[["All.X.glm"]]$finalModel; print(summary(orig_glm))
# orig_glm <- glb_models_lst[["RFE.X.glm"]]$finalModel; print(summary(orig_glm))
# require(car)
# vif_orig_glm <- vif(orig_glm); print(vif_orig_glm)
# # if vif errors out with "there are aliased coefficients in the model"
# alias_orig_glm <- alias(orig_glm); alias_complete_orig_glm <- (alias_orig_glm$Complete > 0); alias_complete_orig_glm <- alias_complete_orig_glm[rowSums(alias_complete_orig_glm) > 0, colSums(alias_complete_orig_glm) > 0]; print(alias_complete_orig_glm)
# print(vif_orig_glm[!is.na(vif_orig_glm) & (vif_orig_glm == Inf)])
# print(which.max(vif_orig_glm))
# print(sort(vif_orig_glm[vif_orig_glm >= 1.0e+03], decreasing=TRUE))
# glbObsFit[c(1143, 3637, 3953, 4105), c("UniqueID", "Popular", "H.P.quandary", "Headline")]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE), ]
# all.equal(glbObsAll$S.chrs.uppr.n.log, glbObsAll$A.chrs.uppr.n.log)
# cor(glbObsAll$S.T.herald, glbObsAll$S.T.tribun)
# mydspObs(Abstract.contains="[Dd]iar", cols=("Abstract"), all=TRUE)
# subset(glb_feats_df, cor.y.abs <= glb_feats_df[glb_feats_df$id == ".rnorm", "cor.y.abs"])
# corxx_mtrx <- cor(data.matrix(glbObsAll[, setdiff(names(glbObsAll), myfind_chr_cols_df(glbObsAll))]), use="pairwise.complete.obs"); abs_corxx_mtrx <- abs(corxx_mtrx); diag(abs_corxx_mtrx) <- 0
# which.max(abs_corxx_mtrx["S.T.tribun", ])
# abs_corxx_mtrx["A.npnct08.log", "S.npnct08.log"]
# step_glm <- step(orig_glm)
# }
# Since caret does not optimize rpart well
# if (method == "rpart")
# ret_lst <- myfit_mdl(mdl_id=paste0(mdl_id_pfx, ".cp.0"), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=0, tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
# User specified
# Ensure at least 2 vars in each regression; else varImp crashes
# sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df; sav_featsimp_df <- glb_featsimp_df; all.equal(sav_featsimp_df, glb_featsimp_df)
# glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df; glm_featsimp_df <- sav_featsimp_df
# easier to exclude features
# require(gdata) # needed for trim
# mdl_id <- "";
# indep_vars_vctr <- head(subset(glb_models_df, grepl("All\\.X\\.", mdl_id), select=feats)
# , 1)[, "feats"]
# indep_vars_vctr <- trim(unlist(strsplit(indep_vars_vctr, "[,]")))
# indep_vars_vctr <- setdiff(indep_vars_vctr, ".rnorm")
# easier to include features
#stop("here"); sav_models_df <- glb_models_df; glb_models_df <- sav_models_df
# !_sp
# mdl_id <- "csm"; indep_vars_vctr <- c(NULL
# ,"prdline.my.fctr", "prdline.my.fctr:.clusterid.fctr"
# ,"prdline.my.fctr*biddable"
# #,"prdline.my.fctr*startprice.log"
# #,"prdline.my.fctr*startprice.diff"
# ,"prdline.my.fctr*condition.fctr"
# ,"prdline.my.fctr*D.terms.post.stop.n"
# #,"prdline.my.fctr*D.terms.post.stem.n"
# ,"prdline.my.fctr*cellular.fctr"
# # ,"<feat1>:<feat2>"
# )
# for (method in glbMdlMethods) {
# ret_lst <- myfit_mdl(mdl_id=mdl_id, model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glb_tune_models_df)
# csm_mdl_id <- paste0(mdl_id, ".", method)
# csm_featsimp_df <- myget_feats_importance(glb_models_lst[[paste0(mdl_id, ".",
# method)]]); print(head(csm_featsimp_df))
# }
###
# Ntv.1.lm <- lm(reformulate(indep_vars_vctr, glb_rsp_var), glbObsTrn); print(summary(Ntv.1.lm))
#glb_models_df[, "max.Accuracy.OOB", FALSE]
#varImp(glb_models_lst[["Low.cor.X.glm"]])
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.2.glm"]])$imp)
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.3.glm"]])$imp)
#glb_feats_df[grepl("npnct28", glb_feats_df$id), ]
# User specified bivariate models
# indep_vars_vctr_lst <- list()
# for (feat in setdiff(names(glbObsFit),
# union(glb_rsp_var, glbFeatsExclude)))
# indep_vars_vctr_lst[[feat]] <- feat
# User specified combinatorial models
# indep_vars_vctr_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# template for myfit_mdl
# rf is hard-coded in caret to recognize only Accuracy / Kappa evaluation metrics
# only for OOB in trainControl ?
# ret_lst <- myfit_mdl_fn(mdl_id=paste0(mdl_id_pfx, ""), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glb_tune_models_df,
# model_loss_mtrx=glbMdlMetric_terms,
# model_summaryFunction=glbMdlMetricSummaryFn,
# model_metric=glbMdlMetricSummary,
# model_metric_maximize=glbMdlMetricMaximize)
# Simplify a model
# fit_df <- glbObsFit; glb_mdl <- step(<complex>_mdl)
# Non-caret models
# rpart_area_mdl <- rpart(reformulate("Area", response=glb_rsp_var),
# data=glbObsFit, #method="class",
# control=rpart.control(cp=0.12),
# parms=list(loss=glbMdlMetric_terms))
# print("rpart_sel_wlm_mdl"); prp(rpart_sel_wlm_mdl)
#
# Dump the complete model-comparison table (one row per fitted model id, with
# its feature list, timing, and fit / OOB metrics) into the report.
print(glb_models_df)
## id
## MFO###myMFO_classfr MFO###myMFO_classfr
## Random###myrandom_classfr Random###myrandom_classfr
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1###glmnet
## Max.cor.Y.rcv.3X1##rcv#glmnet Max.cor.Y.rcv.3X1##rcv#glmnet
## Max.cor.Y.rcv.3X3##rcv#glmnet Max.cor.Y.rcv.3X3##rcv#glmnet
## Max.cor.Y.rcv.3X5##rcv#glmnet Max.cor.Y.rcv.3X5##rcv#glmnet
## Max.cor.Y.rcv.5X1##rcv#glmnet Max.cor.Y.rcv.5X1##rcv#glmnet
## Max.cor.Y.rcv.5X3##rcv#glmnet Max.cor.Y.rcv.5X3##rcv#glmnet
## Max.cor.Y.rcv.5X5##rcv#glmnet Max.cor.Y.rcv.5X5##rcv#glmnet
## Max.cor.Y.rcv.1X1.cp.0###rpart Max.cor.Y.rcv.1X1.cp.0###rpart
## Max.cor.Y##rcv#rpart Max.cor.Y##rcv#rpart
## Max.cor.Y.Time.Poly##rcv#glmnet Max.cor.Y.Time.Poly##rcv#glmnet
## Max.cor.Y.Time.Lag##rcv#glmnet Max.cor.Y.Time.Lag##rcv#glmnet
## Interact.High.cor.Y##rcv#glmnet Interact.High.cor.Y##rcv#glmnet
## Low.cor.X##rcv#glmnet Low.cor.X##rcv#glmnet
## All.X##rcv#glmnet All.X##rcv#glmnet
## feats
## MFO###myMFO_classfr .rnorm
## Random###myrandom_classfr .rnorm
## Max.cor.Y.rcv.1X1###glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.1X1.cp.0###rpart WordCount.root2,NDSSName.my.fctr
## Max.cor.Y##rcv#rpart WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.Time.Poly##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.day.minutes.poly.2,PubDate.day.minutes.poly.3,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## Max.cor.Y.Time.Lag##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.last2.log1p,PubDate.last4.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.last32.log1p,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg
## Interact.High.cor.Y##rcv#glmnet WordCount.root2,NDSSName.my.fctr,WordCount.root2:WordCount.root2,WordCount.root2:PubDate.day.minutes.poly.1,WordCount.root2:PubDate.last4.log1p,WordCount.root2:PubDate.month.fctr
## Low.cor.X##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## All.X##rcv#glmnet WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## max.nTuningRuns min.elapsedtime.everything
## MFO###myMFO_classfr 0 0.296
## Random###myrandom_classfr 0 0.301
## Max.cor.Y.rcv.1X1###glmnet 0 1.051
## Max.cor.Y.rcv.3X1##rcv#glmnet 25 2.538
## Max.cor.Y.rcv.3X3##rcv#glmnet 25 4.667
## Max.cor.Y.rcv.3X5##rcv#glmnet 25 7.072
## Max.cor.Y.rcv.5X1##rcv#glmnet 25 3.530
## Max.cor.Y.rcv.5X3##rcv#glmnet 25 7.415
## Max.cor.Y.rcv.5X5##rcv#glmnet 25 9.956
## Max.cor.Y.rcv.1X1.cp.0###rpart 0 0.994
## Max.cor.Y##rcv#rpart 5 3.519
## Max.cor.Y.Time.Poly##rcv#glmnet 25 13.151
## Max.cor.Y.Time.Lag##rcv#glmnet 5 42.176
## Interact.High.cor.Y##rcv#glmnet 25 5.404
## Low.cor.X##rcv#glmnet 5 77.491
## All.X##rcv#glmnet 5 17.518
## min.elapsedtime.final max.AUCpROC.fit
## MFO###myMFO_classfr 0.004 0.5000000
## Random###myrandom_classfr 0.002 0.4990604
## Max.cor.Y.rcv.1X1###glmnet 0.279 0.8790544
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.273 0.8767919
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.274 0.8767919
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.276 0.8767919
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.277 0.8784031
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.285 0.8784031
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.275 0.8784031
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.076 0.8821543
## Max.cor.Y##rcv#rpart 0.074 0.8709432
## Max.cor.Y.Time.Poly##rcv#glmnet 1.697 0.8734975
## Max.cor.Y.Time.Lag##rcv#glmnet 2.825 0.8621088
## Interact.High.cor.Y##rcv#glmnet 0.342 0.8776419
## Low.cor.X##rcv#glmnet 4.864 0.8624894
## All.X##rcv#glmnet 1.574 0.8371661
## max.Sens.fit max.Spec.fit max.AUCROCR.fit
## MFO###myMFO_classfr 1.0000000 0.0000000 0.5000000
## Random###myrandom_classfr 0.8312611 0.1668598 0.4972757
## Max.cor.Y.rcv.1X1###glmnet 0.9632073 0.7949015 0.9608594
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9705658 0.7937428 0.9504198
## Max.cor.Y##rcv#rpart 0.9632073 0.7786790 0.8746354
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9659985 0.7809965 0.9534659
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9652372 0.7589803 0.9558908
## Interact.High.cor.Y##rcv#glmnet 0.9626998 0.7925840 0.9625372
## Low.cor.X##rcv#glmnet 0.9659985 0.7589803 0.9588640
## All.X##rcv#glmnet 0.9779244 0.6964079 0.9604952
## opt.prob.threshold.fit max.f.score.fit
## MFO###myMFO_classfr 0.1 0.3045703
## Random###myrandom_classfr 0.1 0.3045703
## Max.cor.Y.rcv.1X1###glmnet 0.5 0.8099174
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4 0.8235294
## Max.cor.Y##rcv#rpart 0.6 0.8000000
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4 0.8103957
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2 0.8075338
## Interact.High.cor.Y##rcv#glmnet 0.4 0.8084359
## Low.cor.X##rcv#glmnet 0.2 0.8077374
## All.X##rcv#glmnet 0.3 0.8070588
## max.Accuracy.fit max.AccuracyLower.fit
## MFO###myMFO_classfr 0.1796420 0.1688795
## Random###myrandom_classfr 0.1796420 0.1688795
## Max.cor.Y.rcv.1X1###glmnet 0.9329725 0.9255302
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9335973 0.9255302
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9333193 0.9255302
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9332218 0.9255302
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9331818 0.9259666
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9333905 0.9259666
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9331816 0.9259666
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9381765 0.9309917
## Max.cor.Y##rcv#rpart 0.9296422 0.9224771
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9319320 0.9257484
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9279769 0.9244394
## Interact.High.cor.Y##rcv#glmnet 0.9315850 0.9244394
## Low.cor.X##rcv#glmnet 0.9276303 0.9242213
## All.X##rcv#glmnet 0.9233293 0.9242213
## max.AccuracyUpper.fit max.Kappa.fit
## MFO###myMFO_classfr 0.1907952 0.0000000
## Random###myrandom_classfr 0.1907952 0.0000000
## Max.cor.Y.rcv.1X1###glmnet 0.9398832 0.7692476
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9398832 0.7691678
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9398832 0.7690803
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9398832 0.7686375
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9402789 0.7689055
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9402789 0.7698577
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9402789 0.7691429
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9448229 0.7860827
## Max.cor.Y##rcv#rpart 0.9371115 0.7515134
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9400811 0.7628290
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9388938 0.7473218
## Interact.High.cor.Y##rcv#glmnet 0.9388938 0.7641040
## Low.cor.X##rcv#glmnet 0.9386958 0.7453708
## All.X##rcv#glmnet 0.9386958 0.7141966
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## MFO###myMFO_classfr 0.5000000 1.0000000 0.0000000
## Random###myrandom_classfr 0.5125675 0.8077437 0.2173913
## Max.cor.Y.rcv.1X1###glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.6174697 0.9218959 0.3130435
## Max.cor.Y##rcv#rpart 0.5870523 0.9045394 0.2695652
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5950717 0.9118825 0.2782609
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5927265 0.9158879 0.2695652
## Interact.High.cor.Y##rcv#glmnet 0.6009259 0.9105474 0.2913043
## Low.cor.X##rcv#glmnet 0.5917252 0.9138852 0.2695652
## All.X##rcv#glmnet 0.5873513 0.9399199 0.2347826
## max.AUCROCR.OOB opt.prob.threshold.OOB
## MFO###myMFO_classfr 0.5000000 0.1
## Random###myrandom_classfr 0.4857956 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.8116126 0.1
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7773858 0.1
## Max.cor.Y##rcv#rpart 0.5892132 0.6
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7997373 0.1
## Max.cor.Y.Time.Lag##rcv#glmnet 0.8024758 0.1
## Interact.High.cor.Y##rcv#glmnet 0.8140971 0.1
## Low.cor.X##rcv#glmnet 0.8052766 0.1
## All.X##rcv#glmnet 0.8129999 0.1
## max.f.score.OOB max.Accuracy.OOB
## MFO###myMFO_classfr 0.2349336 0.1331019
## Random###myrandom_classfr 0.2349336 0.1331019
## Max.cor.Y.rcv.1X1###glmnet 0.4405405 0.7604167
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4207493 0.7673611
## Max.cor.Y##rcv#rpart 0.2850575 0.8200231
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4191617 0.7754630
## Max.cor.Y.Time.Lag##rcv#glmnet 0.4118565 0.7818287
## Interact.High.cor.Y##rcv#glmnet 0.4398340 0.7656250
## Low.cor.X##rcv#glmnet 0.4152672 0.7783565
## All.X##rcv#glmnet 0.3871577 0.6244213
## max.AccuracyLower.OOB
## MFO###myMFO_classfr 0.1174298
## Random###myrandom_classfr 0.1174298
## Max.cor.Y.rcv.1X1###glmnet 0.7395703
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.7365992
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.7365992
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.7365992
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.7395703
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.7395703
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.7395703
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7467059
## Max.cor.Y##rcv#rpart 0.8010821
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7550404
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7615963
## Interact.High.cor.Y##rcv#glmnet 0.7449213
## Low.cor.X##rcv#glmnet 0.7580195
## All.X##rcv#glmnet 0.6011030
## max.AccuracyUpper.OOB max.Kappa.OOB
## MFO###myMFO_classfr 0.1500310 0.0000000
## Random###myrandom_classfr 0.1500310 0.0000000
## Max.cor.Y.rcv.1X1###glmnet 0.7803749 0.3148374
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.7775689 0.3107477
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.7775689 0.3107477
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.7775689 0.3107477
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.7803749 0.3373693
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.7803749 0.3373693
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.7803749 0.3373693
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7871043 0.2953321
## Max.cor.Y##rcv#rpart 0.8378705 0.1825002
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7949457 0.2963401
## Max.cor.Y.Time.Lag##rcv#glmnet 0.8010994 0.2908072
## Interact.High.cor.Y##rcv#glmnet 0.7854227 0.3156027
## Low.cor.X##rcv#glmnet 0.7977437 0.2931798
## All.X##rcv#glmnet 0.6473162 0.2258300
## max.AccuracySD.fit max.KappaSD.fit
## MFO###myMFO_classfr NA NA
## Random###myrandom_classfr NA NA
## Max.cor.Y.rcv.1X1###glmnet NA NA
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.007015493 0.02403706
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.005178375 0.01754365
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.005396525 0.01835474
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.008837283 0.03133449
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.006138477 0.02161286
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.006213800 0.02210061
## Max.cor.Y.rcv.1X1.cp.0###rpart NA NA
## Max.cor.Y##rcv#rpart 0.005069520 0.01910910
## Max.cor.Y.Time.Poly##rcv#glmnet 0.005321823 0.01901796
## Max.cor.Y.Time.Lag##rcv#glmnet 0.004678817 0.01649089
## Interact.High.cor.Y##rcv#glmnet 0.005250654 0.01810996
## Low.cor.X##rcv#glmnet 0.004942454 0.01832711
## All.X##rcv#glmnet 0.006254044 0.02777844
# Remove the temporary `ret_lst` object from the workspace.
rm(ret_lst)
# Log the "teardown" minor step that ends the fit.models_1 chunk log
# (presumably myadd_chunk() also time-stamps the previous step -- see the
# bgn/end/elapsed columns it prints below).
fit.models_1_chunk_df <-
myadd_chunk(fit.models_1_chunk_df, "fit.models_1_end", major.inc = FALSE,
label.minor = "teardown")
## label step_major step_minor label_minor bgn end
## 4 fit.models_1_preProc 1 3 preProc 416.482 416.555
## 5 fit.models_1_end 1 4 teardown 416.555 NA
## elapsed
## 4 0.073
## 5 NA
# Add the next "fit.models" entry to the global chunk log without bumping the
# major step (major.inc = FALSE); the printed log shows step 6.1 -> 6.2.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc = FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 11 fit.models 6 1 1 350.285 416.564 66.279
## 12 fit.models 6 2 2 416.565 NA NA
# Start a new chunk log for fit.models_2; NULL is passed in place of an
# existing log.
fit.models_2_chunk_df <-
myadd_chunk(NULL, "fit.models_2_bgn", label.minor = "setup")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_2_bgn 1 0 setup 420.696 NA NA
# Build the radar-plot input from the model table:
#   1. drop every SD / Upper / Lower (spread and CI) column;
#   2. replace each "min.<metric>" column (lower is better) by its reciprocal
#      "inv.<metric>" so that larger is better on every axis.
# Logical masks are used instead of negative grep() indices: `-grep(...)` on an
# empty match is `integer(0)`, which would select *no* columns rather than all.
# Column names are compared exactly (not re-used as regular expressions), and
# the dot after "min" is escaped so only a literal "min." prefix qualifies.
plt_models_df <- glb_models_df[, !grepl("SD|Upper|Lower", names(glb_models_df)),
                               drop = FALSE]
for (var in grep("^min\\.", names(plt_models_df), value = TRUE)) {
    # New "inv.*" column is appended at the end, then the "min.*" source is dropped.
    plt_models_df[, sub("^min\\.", "inv.", var)] <- 1.0 / plt_models_df[, var]
    plt_models_df <- plt_models_df[, names(plt_models_df) != var, drop = FALSE]
}
print(plt_models_df)
## id
## MFO###myMFO_classfr MFO###myMFO_classfr
## Random###myrandom_classfr Random###myrandom_classfr
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1###glmnet
## Max.cor.Y.rcv.3X1##rcv#glmnet Max.cor.Y.rcv.3X1##rcv#glmnet
## Max.cor.Y.rcv.3X3##rcv#glmnet Max.cor.Y.rcv.3X3##rcv#glmnet
## Max.cor.Y.rcv.3X5##rcv#glmnet Max.cor.Y.rcv.3X5##rcv#glmnet
## Max.cor.Y.rcv.5X1##rcv#glmnet Max.cor.Y.rcv.5X1##rcv#glmnet
## Max.cor.Y.rcv.5X3##rcv#glmnet Max.cor.Y.rcv.5X3##rcv#glmnet
## Max.cor.Y.rcv.5X5##rcv#glmnet Max.cor.Y.rcv.5X5##rcv#glmnet
## Max.cor.Y.rcv.1X1.cp.0###rpart Max.cor.Y.rcv.1X1.cp.0###rpart
## Max.cor.Y##rcv#rpart Max.cor.Y##rcv#rpart
## Max.cor.Y.Time.Poly##rcv#glmnet Max.cor.Y.Time.Poly##rcv#glmnet
## Max.cor.Y.Time.Lag##rcv#glmnet Max.cor.Y.Time.Lag##rcv#glmnet
## Interact.High.cor.Y##rcv#glmnet Interact.High.cor.Y##rcv#glmnet
## Low.cor.X##rcv#glmnet Low.cor.X##rcv#glmnet
## All.X##rcv#glmnet All.X##rcv#glmnet
## feats
## MFO###myMFO_classfr .rnorm
## Random###myrandom_classfr .rnorm
## Max.cor.Y.rcv.1X1###glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.3X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X1##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X3##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.5X5##rcv#glmnet WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.rcv.1X1.cp.0###rpart WordCount.root2,NDSSName.my.fctr
## Max.cor.Y##rcv#rpart WordCount.root2,NDSSName.my.fctr
## Max.cor.Y.Time.Poly##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.day.minutes.poly.2,PubDate.day.minutes.poly.3,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## Max.cor.Y.Time.Lag##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.last2.log1p,PubDate.last4.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.last32.log1p,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg
## Interact.High.cor.Y##rcv#glmnet WordCount.root2,NDSSName.my.fctr,WordCount.root2:WordCount.root2,WordCount.root2:PubDate.day.minutes.poly.1,WordCount.root2:PubDate.last4.log1p,WordCount.root2:PubDate.month.fctr
## Low.cor.X##rcv#glmnet WordCount.root2,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.last32.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.last16.log1p.ctg,NDSSName.my.fctr:PubDate.last2.log1p.ctg,NDSSName.my.fctr:PubDate.last4.log1p.ctg,NDSSName.my.fctr:PubDate.last8.log1p.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## All.X##rcv#glmnet WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## max.nTuningRuns max.AUCpROC.fit
## MFO###myMFO_classfr 0 0.5000000
## Random###myrandom_classfr 0 0.4990604
## Max.cor.Y.rcv.1X1###glmnet 0 0.8790544
## Max.cor.Y.rcv.3X1##rcv#glmnet 25 0.8767919
## Max.cor.Y.rcv.3X3##rcv#glmnet 25 0.8767919
## Max.cor.Y.rcv.3X5##rcv#glmnet 25 0.8767919
## Max.cor.Y.rcv.5X1##rcv#glmnet 25 0.8784031
## Max.cor.Y.rcv.5X3##rcv#glmnet 25 0.8784031
## Max.cor.Y.rcv.5X5##rcv#glmnet 25 0.8784031
## Max.cor.Y.rcv.1X1.cp.0###rpart 0 0.8821543
## Max.cor.Y##rcv#rpart 5 0.8709432
## Max.cor.Y.Time.Poly##rcv#glmnet 25 0.8734975
## Max.cor.Y.Time.Lag##rcv#glmnet 5 0.8621088
## Interact.High.cor.Y##rcv#glmnet 25 0.8776419
## Low.cor.X##rcv#glmnet 5 0.8624894
## All.X##rcv#glmnet 5 0.8371661
## max.Sens.fit max.Spec.fit max.AUCROCR.fit
## MFO###myMFO_classfr 1.0000000 0.0000000 0.5000000
## Random###myrandom_classfr 0.8312611 0.1668598 0.4972757
## Max.cor.Y.rcv.1X1###glmnet 0.9632073 0.7949015 0.9608594
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9644760 0.7891078 0.9582555
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9642223 0.7925840 0.9607052
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9705658 0.7937428 0.9504198
## Max.cor.Y##rcv#rpart 0.9632073 0.7786790 0.8746354
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9659985 0.7809965 0.9534659
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9652372 0.7589803 0.9558908
## Interact.High.cor.Y##rcv#glmnet 0.9626998 0.7925840 0.9625372
## Low.cor.X##rcv#glmnet 0.9659985 0.7589803 0.9588640
## All.X##rcv#glmnet 0.9779244 0.6964079 0.9604952
## opt.prob.threshold.fit max.f.score.fit
## MFO###myMFO_classfr 0.1 0.3045703
## Random###myrandom_classfr 0.1 0.3045703
## Max.cor.Y.rcv.1X1###glmnet 0.5 0.8099174
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4 0.8099174
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5 0.8104265
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4 0.8235294
## Max.cor.Y##rcv#rpart 0.6 0.8000000
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4 0.8103957
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2 0.8075338
## Interact.High.cor.Y##rcv#glmnet 0.4 0.8084359
## Low.cor.X##rcv#glmnet 0.2 0.8077374
## All.X##rcv#glmnet 0.3 0.8070588
## max.Accuracy.fit max.Kappa.fit
## MFO###myMFO_classfr 0.1796420 0.0000000
## Random###myrandom_classfr 0.1796420 0.0000000
## Max.cor.Y.rcv.1X1###glmnet 0.9329725 0.7692476
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.9335973 0.7691678
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.9333193 0.7690803
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.9332218 0.7686375
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.9331818 0.7689055
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.9333905 0.7698577
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.9331816 0.7691429
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.9381765 0.7860827
## Max.cor.Y##rcv#rpart 0.9296422 0.7515134
## Max.cor.Y.Time.Poly##rcv#glmnet 0.9319320 0.7628290
## Max.cor.Y.Time.Lag##rcv#glmnet 0.9279769 0.7473218
## Interact.High.cor.Y##rcv#glmnet 0.9315850 0.7641040
## Low.cor.X##rcv#glmnet 0.9276303 0.7453708
## All.X##rcv#glmnet 0.9233293 0.7141966
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## MFO###myMFO_classfr 0.5000000 1.0000000 0.0000000
## Random###myrandom_classfr 0.5125675 0.8077437 0.2173913
## Max.cor.Y.rcv.1X1###glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5962443 0.9098798 0.2826087
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.6174697 0.9218959 0.3130435
## Max.cor.Y##rcv#rpart 0.5870523 0.9045394 0.2695652
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5950717 0.9118825 0.2782609
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5927265 0.9158879 0.2695652
## Interact.High.cor.Y##rcv#glmnet 0.6009259 0.9105474 0.2913043
## Low.cor.X##rcv#glmnet 0.5917252 0.9138852 0.2695652
## All.X##rcv#glmnet 0.5873513 0.9399199 0.2347826
## max.AUCROCR.OOB opt.prob.threshold.OOB
## MFO###myMFO_classfr 0.5000000 0.1
## Random###myrandom_classfr 0.4857956 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.8116126 0.1
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.8067975 0.1
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.8114863 0.1
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7773858 0.1
## Max.cor.Y##rcv#rpart 0.5892132 0.6
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7997373 0.1
## Max.cor.Y.Time.Lag##rcv#glmnet 0.8024758 0.1
## Interact.High.cor.Y##rcv#glmnet 0.8140971 0.1
## Low.cor.X##rcv#glmnet 0.8052766 0.1
## All.X##rcv#glmnet 0.8129999 0.1
## max.f.score.OOB max.Accuracy.OOB
## MFO###myMFO_classfr 0.2349336 0.1331019
## Random###myrandom_classfr 0.2349336 0.1331019
## Max.cor.Y.rcv.1X1###glmnet 0.4405405 0.7604167
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4375839 0.7575231
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.4609375 0.7604167
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4207493 0.7673611
## Max.cor.Y##rcv#rpart 0.2850575 0.8200231
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4191617 0.7754630
## Max.cor.Y.Time.Lag##rcv#glmnet 0.4118565 0.7818287
## Interact.High.cor.Y##rcv#glmnet 0.4398340 0.7656250
## Low.cor.X##rcv#glmnet 0.4152672 0.7783565
## All.X##rcv#glmnet 0.3871577 0.6244213
## max.Kappa.OOB inv.elapsedtime.everything
## MFO###myMFO_classfr 0.0000000 3.37837838
## Random###myrandom_classfr 0.0000000 3.32225914
## Max.cor.Y.rcv.1X1###glmnet 0.3148374 0.95147479
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.3107477 0.39401103
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.3107477 0.21427041
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.3107477 0.14140271
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.3373693 0.28328612
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.3373693 0.13486177
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.3373693 0.10044194
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.2953321 1.00603622
## Max.cor.Y##rcv#rpart 0.1825002 0.28417164
## Max.cor.Y.Time.Poly##rcv#glmnet 0.2963401 0.07603984
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2908072 0.02371017
## Interact.High.cor.Y##rcv#glmnet 0.3156027 0.18504811
## Low.cor.X##rcv#glmnet 0.2931798 0.01290472
## All.X##rcv#glmnet 0.2258300 0.05708414
## inv.elapsedtime.final
## MFO###myMFO_classfr 250.0000000
## Random###myrandom_classfr 500.0000000
## Max.cor.Y.rcv.1X1###glmnet 3.5842294
## Max.cor.Y.rcv.3X1##rcv#glmnet 3.6630037
## Max.cor.Y.rcv.3X3##rcv#glmnet 3.6496350
## Max.cor.Y.rcv.3X5##rcv#glmnet 3.6231884
## Max.cor.Y.rcv.5X1##rcv#glmnet 3.6101083
## Max.cor.Y.rcv.5X3##rcv#glmnet 3.5087719
## Max.cor.Y.rcv.5X5##rcv#glmnet 3.6363636
## Max.cor.Y.rcv.1X1.cp.0###rpart 13.1578947
## Max.cor.Y##rcv#rpart 13.5135135
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5892752
## Max.cor.Y.Time.Lag##rcv#glmnet 0.3539823
## Interact.High.cor.Y##rcv#glmnet 2.9239766
## Low.cor.X##rcv#glmnet 0.2055921
## All.X##rcv#glmnet 0.6353240
# Plot all models against the metrics in plt_models_df (presumably as a radar
# chart, going by the helper's name).
# NOTE(review): the warnings recorded below come from plotting 16 models: the
# RColorBrewer Set1 palette tops out at 9 colours and ggplot2's default shape
# palette at 6 shapes.
print(myplot_radar(radar_inp_df=plt_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 16. Consider specifying shapes manually if you must have them.
## Warning: Removed 200 rows containing missing values (geom_point).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 16. Consider specifying shapes manually if you must have them.
# print(myplot_radar(radar_inp_df=subset(plt_models_df,
# !(mdl_id %in% grep("random|MFO", plt_models_df$id, value=TRUE)))))
# Compute CI for <metric>SD
# For every "<prefix>SD<suffix>" column, add "<prefix>Upper<suffix>" and
# "<prefix>Lower<suffix>" columns as estimate +/- scaler * SD, where the scaler
# is the 97.5% t-quantile with df = max.nTuningRuns - 1 (NA when there was at
# most one tuning run).
glb_models_df <- mutate(
    glb_models_df,
    max.df = ifelse(max.nTuningRuns > 1, max.nTuningRuns - 1, NA),
    min.sd2ci.scaler = ifelse(is.na(max.df), NA, qt(0.975, max.df))
)
for (var in grep("SD", names(glb_models_df), value = TRUE)) {
    # Pieces of the column name on either side of "SD".
    var_components <- unlist(strsplit(var, "SD"))
    varActul <- paste0(var_components[1], var_components[2])
    varUpper <- paste0(var_components[1], "Upper", var_components[2])
    varLower <- paste0(var_components[1], "Lower", var_components[2])

    if (!(varUpper %in% names(glb_models_df))) {
        print(sprintf("var:%s", var))
        # CI width depends on the sample size through the t distribution; df = n - 1
        ci_half_width <- glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
        glb_models_df[, varUpper] <- glb_models_df[, varActul] + ci_half_width
        glb_models_df[, varLower] <- glb_models_df[, varActul] - ci_half_width
    } else {
        # The CI already exists (the Lower bound is assumed to exist as well):
        # leave the stored bounds untouched.
        warning(varUpper, " already exists in glb_models_df")
    }
}
## Warning: max.AccuracyUpper.fit already exists in glb_models_df
## [1] "var:max.KappaSD.fit"
# Plot metrics with CI
# For each metric that has an "Upper" bound column, copy its point estimate
# into plt_models_df and both of its bounds into pltCI_models_df. Both frames
# start out with just the model id.
plt_models_df <- glb_models_df[, "id", drop = FALSE]
pltCI_models_df <- glb_models_df[, "id", drop = FALSE]
for (var in grep("Upper", names(glb_models_df), value = TRUE)) {
    var_components <- unlist(strsplit(var, "Upper"))
    # Removing "Upper" from the name yields the point-estimate column.
    col_name <- paste(var_components, collapse = "")
    plt_models_df[, col_name] <- glb_models_df[, col_name]
    for (name in paste0(var_components[1], c("Upper", "Lower"), var_components[2])) {
        pltCI_models_df[, name] <- glb_models_df[, name]
    }
}
# Melt a wide per-model metrics frame into long format and tag every row with
# the metric label and data partition parsed from the melted column name.
#
# Args:
#   plt_models_df: data frame holding an "id" column plus metric columns whose
#     names look like "<label>.<data>" (e.g. "max.Accuracy.fit").
#
# Returns:
#   The result of melt(plt_models_df, id.vars="id") with two character columns
#   added:
#     data  - the text after the last "." of `variable` (e.g. "fit");
#     label - the text before that last "." (e.g. "max.Accuracy").
#
# Both pieces are taken relative to the *last* literal dot. The previous
# implementation split on the regex paste0(".", data), where "." matched any
# character and the first match won, so a name such as "xfit.fit" produced an
# empty label. The extraction is vectorised, so a zero-row input is handled.
build_statsCI_data <- function(plt_models_df) {
    mltd_models_df <- melt(plt_models_df, id.vars="id")
    variable_names <- as.character(mltd_models_df$variable)
    mltd_models_df$data <- sub("^.*[.]", "", variable_names)
    mltd_models_df$label <- sub("[.][^.]*$", "", variable_names)
    #print(mltd_models_df)
    return(mltd_models_df)
}
# Long-format point estimates, tagged with label / data by build_statsCI_data().
mltd_models_df <- build_statsCI_data(plt_models_df)
# Long-format CI bounds. Each `variable` is expected to look like
# "<label>Upper.<data>" or "<label>Lower.<data>"; the loop below splits it into
# the columns label, data and type ("Upper" / "Lower").
mltdCI_models_df <- melt(pltCI_models_df, id.vars="id")
# seq_len() rather than 1:nrow(): the latter walks rows 1 and 0 of an empty frame.
for (row_ix in seq_len(nrow(mltdCI_models_df))) {
    row_variable <- as.character(mltdCI_models_df[row_ix, "variable"])
    for (type in c("Upper", "Lower")) {
        var_components <- unlist(strsplit(row_variable, type))
        # A single piece means this bound type does not occur in the name.
        if (length(var_components) <= 1)
            next
        #print(sprintf("row_ix:%d; type:%s; ", row_ix, type))
        mltdCI_models_df[row_ix, "label"] <- var_components[1]
        # The text after the bound type starts with ".", so the data partition
        # is the second dot-separated token.
        mltdCI_models_df[row_ix, "data"] <-
            unlist(strsplit(var_components[2], "[.]"))[2]
        mltdCI_models_df[row_ix, "type"] <- type
        break
    }
}
# Pivot the long CI table to wide form: the rows are keyed by every column
# other than type / value / variable, and the "Upper" and "Lower" values land
# side by side (value.Upper, value.Lower).
ci_long_df <- subset(mltdCI_models_df, select=-variable)
ci_key_cols <- setdiff(names(mltdCI_models_df), c("type", "value", "variable"))
wideCI_models_df <- reshape(ci_long_df,
                            idvar=ci_key_cols,
                            timevar="type",
                            direction="wide")
#print(wideCI_models_df)
# Attach the point estimates to the bounds; merge() joins on the columns the
# two frames have in common and keeps every CI row (all.x=TRUE).
mrgdCI_models_df <- merge(wideCI_models_df, mltd_models_df, all.x=TRUE)
#print(mrgdCI_models_df)
# Merge stats back in if CIs don't exist
goback_vars <- c()
for (var in unique(mltd_models_df$label)) {
for (type in unique(mltd_models_df$data)) {
var_type <- paste0(var, ".", type)
# if this data is already present, next
if (var_type %in% unique(paste(mltd_models_df$label, mltd_models_df$data,
sep=".")))
next
#print(sprintf("var_type:%s", var_type))
goback_vars <- c(goback_vars, var_type)
}
}
if (length(goback_vars) > 0) {
mltd_goback_df <- build_statsCI_data(glb_models_df[, c("id", goback_vars)])
mltd_models_df <- rbind(mltd_models_df, mltd_goback_df)
}
# mltd_models_df <- merge(mltd_models_df, glb_models_df[, c("id", "model_method")],
# all.x=TRUE)
png(paste0(glb_out_pfx, "models_bar.png"), width=480*3, height=480*2)
#print(gp <- myplot_bar(mltd_models_df, "id", "value", colorcol_name="model_method") +
print(gp <- myplot_bar(df=mltd_models_df, xcol_name="id", ycol_names="value") +
geom_errorbar(data=mrgdCI_models_df,
mapping=aes(x=mdl_id, ymax=value.Upper, ymin=value.Lower), width=0.5) +
facet_grid(label ~ data, scales="free") +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)))
## Warning: Removed 4 rows containing missing values (geom_errorbar).
# Close the current graphics device (the png() device opened just before the
# bar plot was printed).
dev.off()
## quartz_off_screen
## 2
# Print the same plot object again, this time on whatever device is now current.
print(gp)
## Warning: Removed 4 rows containing missing values (geom_errorbar).
# Columns to display: the model id, the metrics listed in glbMdlMetricsEval
# that exist in glb_models_df, and every column whose name contains the
# literal text "opt." (fixed = TRUE, so the dot is not a regex wildcard).
dsp_models_cols <- c("id",
glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
# if (glb_is_classification && glb_is_binomial)
# dsp_models_cols <- c(dsp_models_cols, "opt.prob.threshold.OOB")
# Order glb_models_df by the model-selection formula, keep the display
# columns, and both store and print the result.
print(dsp_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)[, dsp_models_cols])
## id
## Max.cor.Y##rcv#rpart Max.cor.Y##rcv#rpart
## Max.cor.Y.Time.Lag##rcv#glmnet Max.cor.Y.Time.Lag##rcv#glmnet
## Low.cor.X##rcv#glmnet Low.cor.X##rcv#glmnet
## Max.cor.Y.Time.Poly##rcv#glmnet Max.cor.Y.Time.Poly##rcv#glmnet
## Max.cor.Y.rcv.1X1.cp.0###rpart Max.cor.Y.rcv.1X1.cp.0###rpart
## Interact.High.cor.Y##rcv#glmnet Interact.High.cor.Y##rcv#glmnet
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1###glmnet
## Max.cor.Y.rcv.5X3##rcv#glmnet Max.cor.Y.rcv.5X3##rcv#glmnet
## Max.cor.Y.rcv.5X1##rcv#glmnet Max.cor.Y.rcv.5X1##rcv#glmnet
## Max.cor.Y.rcv.5X5##rcv#glmnet Max.cor.Y.rcv.5X5##rcv#glmnet
## Max.cor.Y.rcv.3X1##rcv#glmnet Max.cor.Y.rcv.3X1##rcv#glmnet
## Max.cor.Y.rcv.3X3##rcv#glmnet Max.cor.Y.rcv.3X3##rcv#glmnet
## Max.cor.Y.rcv.3X5##rcv#glmnet Max.cor.Y.rcv.3X5##rcv#glmnet
## All.X##rcv#glmnet All.X##rcv#glmnet
## MFO###myMFO_classfr MFO###myMFO_classfr
## Random###myrandom_classfr Random###myrandom_classfr
## max.Accuracy.OOB max.AUCROCR.OOB
## Max.cor.Y##rcv#rpart 0.8200231 0.5892132
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7818287 0.8024758
## Low.cor.X##rcv#glmnet 0.7783565 0.8052766
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7754630 0.7997373
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7673611 0.7773858
## Interact.High.cor.Y##rcv#glmnet 0.7656250 0.8140971
## Max.cor.Y.rcv.1X1###glmnet 0.7604167 0.8116126
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.7575231 0.8067975
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.7575231 0.8067975
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.7575231 0.8067975
## All.X##rcv#glmnet 0.6244213 0.8129999
## MFO###myMFO_classfr 0.1331019 0.5000000
## Random###myrandom_classfr 0.1331019 0.4857956
## max.AUCpROC.OOB max.Accuracy.fit
## Max.cor.Y##rcv#rpart 0.5870523 0.9296422
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5927265 0.9279769
## Low.cor.X##rcv#glmnet 0.5917252 0.9276303
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5950717 0.9319320
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.6174697 0.9381765
## Interact.High.cor.Y##rcv#glmnet 0.6009259 0.9315850
## Max.cor.Y.rcv.1X1###glmnet 0.5962443 0.9329725
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5962443 0.9333905
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5962443 0.9331818
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5962443 0.9331816
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.5962443 0.9335973
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.5962443 0.9333193
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.5962443 0.9332218
## All.X##rcv#glmnet 0.5873513 0.9233293
## MFO###myMFO_classfr 0.5000000 0.1796420
## Random###myrandom_classfr 0.5125675 0.1796420
## opt.prob.threshold.fit
## Max.cor.Y##rcv#rpart 0.6
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2
## Low.cor.X##rcv#glmnet 0.2
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4
## Interact.High.cor.Y##rcv#glmnet 0.4
## Max.cor.Y.rcv.1X1###glmnet 0.5
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4
## All.X##rcv#glmnet 0.3
## MFO###myMFO_classfr 0.1
## Random###myrandom_classfr 0.1
## opt.prob.threshold.OOB
## Max.cor.Y##rcv#rpart 0.6
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1
## Low.cor.X##rcv#glmnet 0.1
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.1
## Interact.High.cor.Y##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.1
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.1
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.1
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.1
## All.X##rcv#glmnet 0.1
## MFO###myMFO_classfr 0.1
## Random###myrandom_classfr 0.1
# Radar plot of the displayed model metrics (dsp_models_df).
print(myplot_radar(radar_inp_df = dsp_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 16. Consider specifying shapes manually if you must have them.
## Warning: Removed 70 rows containing missing values (geom_point).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 16. Consider specifying shapes manually if you must have them.
# Show the formula that was used to order dsp_models_df.
print("Metrics used for model selection:"); print(get_model_sel_frmla())
## [1] "Metrics used for model selection:"
## ~-max.Accuracy.OOB - max.AUCROCR.OOB - max.AUCpROC.OOB - max.Accuracy.fit -
## opt.prob.threshold.OOB
## <environment: 0x7f8dc1cb6200>
# dsp_models_df is already ordered by that formula, so row 1 is reported as
# the best model.
print(sprintf("Best model id: %s", dsp_models_df[1, "id"]))
## [1] "Best model id: Max.cor.Y##rcv#rpart"
# Add one model's predictions and prediction diagnostics to a data frame.
#
# Args:
#   df:      observations to score; must contain the response column rsp_var.
#   mdl_id:  name of the model in glb_models_lst (and value of the id column
#            in glb_models_df).
#   rsp_var: name of the response column in df.
#   prob_threshold_def: binomial classification only - probability cut-off
#            used when glb_models_df holds no usable opt.prob.threshold.OOB
#            value for mdl_id.
#   verbose: when TRUE, print the diagnostics (plots for regression, the rows
#            with the largest absolute error for both problem types).
#
# Returns:
#   df with extra columns named by mygetPredictIds(rsp_var, mdl_id):
#   value (prediction), prob (binomial only), err, err.abs and is.acc.
#
# Stops when the model is unknown, when no probability threshold is available,
# and for multinomial classification (not implemented).
glb_get_predictions <- function(df, mdl_id, rsp_var, prob_threshold_def=NULL, verbose=FALSE) {
    mdl <- glb_models_lst[[mdl_id]]
    if (is.null(mdl))
        stop("Model not found in glb_models_lst: ", mdl_id)

    clmnNames <- mygetPredictIds(rsp_var, mdl_id)
    predct_var_name <- clmnNames$value
    predct_prob_var_name <- clmnNames$prob
    predct_accurate_var_name <- clmnNames$is.acc
    predct_error_var_name <- clmnNames$err
    predct_erabs_var_name <- clmnNames$err.abs

    # The response column is always addressed through the rsp_var argument so
    # the derived columns are consistent with the names built from it above.
    if (glb_is_regression) {
        df[, predct_var_name] <- predict(mdl, newdata=df, type="raw")
        if (verbose) print(myplot_scatter(df, rsp_var, predct_var_name) +
                  facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
                  stat_smooth(method="glm"))

        df[, predct_error_var_name] <- df[, predct_var_name] - df[, rsp_var]
        if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
                  #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
                  stat_smooth(method="auto"))
        if (verbose) print(myplot_scatter(df, rsp_var, predct_error_var_name) +
                  #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
                  stat_smooth(method="glm"))

        df[, predct_erabs_var_name] <- abs(df[, predct_error_var_name])
        if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))

        df[, predct_accurate_var_name] <- (df[, rsp_var] == df[, predct_var_name])
    }

    if (glb_is_classification && glb_is_binomial) {
        # Look up the model's tuned threshold. The lookup is empty when the
        # column or the model row is missing, so test its length before
        # is.na(): a zero-length value is not a valid `if` condition.
        prob_threshold <- NULL
        if ("opt.prob.threshold.OOB" %in% names(glb_models_df))
            prob_threshold <- head(glb_models_df[glb_models_df$id == mdl_id,
                                                 "opt.prob.threshold.OOB"], 1)
        if (length(prob_threshold) == 0L || is.na(prob_threshold)) {
            warning("Using default probability threshold: ", prob_threshold_def)
            if (is.null(prob_threshold <- prob_threshold_def))
                stop("Default probability threshold is NULL")
        }

        rsp_levels <- levels(df[, rsp_var])
        # Column 2 of the probability output is paired with rsp_levels[2]
        # below, which is treated as the positive class.
        df[, predct_prob_var_name] <- predict(mdl, newdata = df, type = "prob")[, 2]
        df[, predct_var_name] <-
            factor(rsp_levels[(df[, predct_prob_var_name] >= prob_threshold) * 1 + 1],
                   rsp_levels)

#         if (verbose) print(myplot_scatter(df, rsp_var, predct_var_name) +
#                   facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
#                   stat_smooth(method="glm"))

        df[, predct_error_var_name] <- df[, predct_var_name] != df[, rsp_var]

        # Absolute error = distance of the predicted probability from the
        # probability a perfect model would give (1 for an actual positive,
        # 0 for an actual negative). which() drops rows where the comparison
        # is NA, so those rows get no err.abs value.
        is_correct <- df[, predct_var_name] == df[, rsp_var]
        is_wrong   <- df[, predct_var_name] != df[, rsp_var]
        is_pos_prd <- df[, predct_var_name] == rsp_levels[2]
        is_neg_prd <- df[, predct_var_name] == rsp_levels[1]

        # if prediction is a TP (true +ve), measure distance from 1.0
        tp <- which(is_correct & is_pos_prd)
        df[tp, predct_erabs_var_name] <- abs(1 - df[tp, predct_prob_var_name])

        # if prediction is a TN (true -ve), measure distance from 0.0
        tn <- which(is_correct & is_neg_prd)
        df[tn, predct_erabs_var_name] <- abs(0 - df[tn, predct_prob_var_name])

        # if prediction is a FP (false +ve), measure distance from 0.0
        fp <- which(is_wrong & is_pos_prd)
        df[fp, predct_erabs_var_name] <- abs(0 - df[fp, predct_prob_var_name])

        # if prediction is a FN (false -ve), measure distance from 1.0
        fn <- which(is_wrong & is_neg_prd)
        df[fn, predct_erabs_var_name] <- abs(1 - df[fn, predct_prob_var_name])

        if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))

        df[, predct_accurate_var_name] <- (df[, rsp_var] == df[, predct_var_name])
    }

    if (glb_is_classification && !glb_is_binomial) {
        df[, predct_var_name] <- predict(mdl, newdata = df, type = "raw")
        df[, paste0(predct_var_name, ".prob")] <-
            predict(mdl, newdata = df, type = "prob")
        stop("Multinomial prediction error calculation needs to be implemented...")
    }

    return(df)
}
#stop(here"); glb2Sav(); glbObsAll <- savObsAll; glbObsTrn <- savObsTrn; glbObsFit <- savObsFit; glbObsOOB <- savObsOOB; sav_models_df <- glb_models_df; glb_models_df <- sav_models_df; glb_featsimp_df <- sav_featsimp_df
# Summarise a model's absolute prediction error per feature category.
#
# Args:
#   obs_df: observations; predictions for mdl_id are added through
#           glb_get_predictions() when the prediction column is absent.
#   mdl_id: model id used to build the prediction column names.
#   label:  tag embedded in the output column names (e.g. "fit").
#
# Returns:
#   An ungrouped frame with one row per glbFeatsCategory value and the columns
#   err.abs.<label>.sum, err.abs.<label>.mean and .n.<label>.
myget_category_stats <- function(obs_df, mdl_id, label) {
    require(dplyr)
    require(lazyeval)

    prd_ids <- mygetPredictIds(glb_rsp_var, mdl_id)
    prd_col <- prd_ids$value
    err_abs_col <- prd_ids$err.abs
    out_err_col <- paste0("err.abs.", label)

    if (!(prd_col %in% names(obs_df))) {
        obs_df <- glb_get_predictions(obs_df, mdl_id, glb_rsp_var)
    }

    # Keep only the columns needed; the last one (absolute error) is renamed
    # so the summary expressions can refer to it by the label-specific name.
    ctgry_obs_df <- obs_df[, c(glbFeatsCategory, glb_rsp_var, prd_col, err_abs_col)]
    names(ctgry_obs_df)[ncol(ctgry_obs_df)] <- out_err_col

    grouped_df <- dplyr::group_by_(ctgry_obs_df, glbFeatsCategory)
    stats_df <- dplyr::summarise_(grouped_df,
                    interp(~sum(var), var=as.name(out_err_col)),
                    interp(~mean(var), var=as.name(out_err_col)),
                    interp(~n()))
    names(stats_df) <- c(glbFeatsCategory,
                         paste0(out_err_col, ".sum"),
                         paste0(out_err_col, ".mean"),
                         paste0(".n.", label))

    dplyr::ungroup(stats_df)
}
#print(colSums((ctgry_df <- myget_category_stats(obs_df=glbObsFit, mdl_id="", label="fit"))[, -grep(glbFeatsCategory, names(ctgry_df))]))
# Fit ensemble models that stack the predictions of previously fitted models.
# glb_mdl_ensemble may be:
#   NULL              - skip ensembles altogether
#   "auto"            - use every model ranked ahead of the baseline models
#   "%<d-% <R expr>"  - evaluate <R expr> to obtain the member model ids
#   character vector  - explicit member model ids
if (!is.null(glb_mdl_ensemble)) {
    fit.models_2_chunk_df <- myadd_chunk(fit.models_2_chunk_df,
                            paste0("fit.models_2_", mdl_id_pfx), major.inc = TRUE,
                                                label.minor = "ensemble")

    mdl_id_pfx <- "Ensemble"

    # Scalar `&&` (not elementwise `&`) because this is an `if` condition
    if (#(glb_is_regression) ||
        (glb_is_classification && !glb_is_binomial))
        stop("Ensemble models not implemented yet for multinomial classification")

    # Ids of the models that rank ahead of the first MFO / Random / Baseline
    # model under the model-selection ordering, excluding earlier ensembles.
    mygetEnsembleAutoMdlIds <- function() {
        tmp_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)
        row.names(tmp_models_df) <- tmp_models_df$id
        baseline_pos <- grep("MFO|Random|Baseline", tmp_models_df$id)
        # No baseline model present: every model qualifies.
        # Baseline ranked first: nothing qualifies. seq_len(0) is empty,
        # whereas 1:0 would select the baseline model itself.
        mdl_threshold_pos <- if (length(baseline_pos) > 0)
            min(baseline_pos) - 1 else nrow(tmp_models_df)
        mdlIds <- tmp_models_df$id[seq_len(mdl_threshold_pos)]
        return(mdlIds[!grepl("Ensemble", mdlIds)])
    }

    # glb_mdl_ensemble can legitimately be a vector of model ids, so the
    # keyword checks must only fire for a single string; a longer vector is
    # not a valid `if` condition.
    if ((length(glb_mdl_ensemble) == 1) && (glb_mdl_ensemble == "auto")) {
        glb_mdl_ensemble <- mygetEnsembleAutoMdlIds()
        mdl_id_pfx <- paste0(mdl_id_pfx, ".auto")
    } else if ((length(glb_mdl_ensemble) == 1) &&
               grepl("^%<d-%", glb_mdl_ensemble)) {
        # NOTE(review): evaluates R code taken from a configuration string;
        # acceptable only while that string is authored in this script.
        glb_mdl_ensemble <- eval(parse(text =
                        str_trim(unlist(strsplit(glb_mdl_ensemble, "%<d-%"))[2])))
    }

    # Make sure every member model's predictions exist in both partitions
    for (mdl_id in glb_mdl_ensemble) {
        if (!(mdl_id %in% names(glb_models_lst))) {
            warning("Model ", mdl_id, " in glb_mdl_ensemble not found !")
            next
        }
        glbObsFit <- glb_get_predictions(df = glbObsFit, mdl_id, glb_rsp_var)
        glbObsOOB <- glb_get_predictions(df = glbObsOOB, mdl_id, glb_rsp_var)
    }

#mdl_id_pfx <- "Ensemble.RFE"; mdlId <- paste0(mdl_id_pfx, ".glmnet")
#glb_mdl_ensemble <- gsub(mygetPredictIds$value, "", grep("RFE\\.X\\.(?!Interact)", row.names(glb_featsimp_df), perl = TRUE, value = TRUE), fixed = TRUE)
#varImp(glb_models_lst[[mdlId]])

#cor_df <- data.frame(cor=cor(glbObsFit[, glb_rsp_var], glbObsFit[, paste(mygetPredictIds$value, glb_mdl_ensemble)], use="pairwise.complete.obs"))
#glbObsFit <- glb_get_predictions(df=glbObsFit, "Ensemble.glmnet", glb_rsp_var);print(colSums((ctgry_df <- myget_category_stats(obs_df=glbObsFit, mdl_id="Ensemble.glmnet", label="fit"))[, -grep(glbFeatsCategory, names(ctgry_df))]))

    ### bid0_sp
    #  Better than MFO; models.n=28; min.RMSE.fit=0.0521233; err.abs.fit.sum=7.3631895
    #  old: Top x from auto; models.n= 5; min.RMSE.fit=0.06311047; err.abs.fit.sum=9.5937080
    #  RFE only ;       models.n=16; min.RMSE.fit=0.05148588; err.abs.fit.sum=7.2875091
    #  RFE subset only ;models.n= 5; min.RMSE.fit=0.06040702; err.abs.fit.sum=9.059088
    #  RFE subset only ;models.n= 9; min.RMSE.fit=0.05933167; err.abs.fit.sum=8.7421288
    #  RFE subset only ;models.n=15; min.RMSE.fit=0.0584607; err.abs.fit.sum=8.5902066
    #  RFE subset only ;models.n=17; min.RMSE.fit=0.05496899; err.abs.fit.sum=8.0170431
    #  RFE subset only ;models.n=18; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
    #  RFE subset only ;models.n=16; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
    ### bid0_sp
    ### bid1_sp
    # "auto"; err.abs.fit.sum=76.699774; min.RMSE.fit=0.2186429
    # "RFE.X.*"; err.abs.fit.sum=; min.RMSE.fit=0.221114
    ### bid1_sp

    # Predictors of the ensemble = the member models' prediction columns
    # (probability columns for classification)
    indep_vars <- paste0(mygetPredictIds(glb_rsp_var)$value, glb_mdl_ensemble)
    if (glb_is_classification)
        indep_vars <- paste0(indep_vars, ".prob")
    # Some models in glb_mdl_ensemble might not be fitted e.g. RFE.X.Interact
    indep_vars <- intersect(indep_vars, names(glbObsFit))
    if (length(indep_vars) == 0)
        stop("No fitted member model predictions available for the ensemble")

#     indep_vars <- grep(mygetPredictIds(glb_rsp_var)$value, names(glbObsFit), fixed=TRUE, value=TRUE)
#     if (glb_is_regression)
#         indep_vars <- indep_vars[!grepl("(err\\.abs|accurate)$", indep_vars)]
#     if (glb_is_classification && glb_is_binomial)
#         indep_vars <- grep("prob$", indep_vars, value=TRUE) else
#         indep_vars <- indep_vars[!grepl("err$", indep_vars)]

    #rfe_fit_ens_results <- myrun_rfe(glbObsFit, indep_vars)

    for (method in c("glm", "glmnet")) {
        for (trainControlMethod in
             c("boot", "boot632", "cv", "repeatedcv"
               #, "LOOCV" # tuneLength * nrow(fitDF)
               , "LGOCV", "adaptive_cv"
               #, "adaptive_boot"  #error: adaptive$min should be less than 3
               #, "adaptive_LGOCV" #error: adaptive$min should be less than 3
               )) {
            #sav_models_df <- glb_models_df; all.equal(sav_models_df, glb_models_df)
            #glb_models_df <- sav_models_df; print(glb_models_df$id)

            if ((method == "glm") && (trainControlMethod != "repeatedcv"))
                # glm used only to identify outliers
                next

            ret_lst <- myfit_mdl(
                mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
                    id.prefix = paste0(mdl_id_pfx, ".", trainControlMethod),
                    type = glb_model_type, tune.df = NULL,
                    trainControl.method = trainControlMethod,
                    trainControl.number = glb_rcv_n_folds,
                    trainControl.repeats = glb_rcv_n_repeats,
                    trainControl.classProbs = glb_is_classification,
                    trainControl.summaryFunction = glbMdlMetricSummaryFn,
                    train.metric = glbMdlMetricSummary,
                    train.maximize = glbMdlMetricMaximize,
                    train.method = method)),
                indep_vars = indep_vars, rsp_var = glb_rsp_var,
                fit_df = glbObsFit, OOB_df = glbObsOOB)
        }
    }
    dsp_models_df <- get_dsp_models_df()
}
# Choose the final model: default to the top-ranked row of dsp_models_df
# unless the user supplied glb_sel_mdl_id.
if (is.null(glb_sel_mdl_id)) {
    glb_sel_mdl_id <- dsp_models_df[1, "id"]
} else {
    print(sprintf("User specified selection: %s", glb_sel_mdl_id))
}
## [1] "User specified selection: All.X##rcv#glmnet"
# `[[` returns NULL for an unknown list name, so a mistyped id would only
# surface later as an obscure failure; fail here with a clear message instead.
if (!(glb_sel_mdl_id %in% names(glb_models_lst)))
    stop("Selected model id not found in glb_models_lst: ", glb_sel_mdl_id)
myprint_mdl(glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]])
## Length Class Mode
## a0 100 -none- numeric
## beta 16200 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 162 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.70728068
## NDSSName.my.fctr#Multimedia#
## -0.03057558
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.60940339
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 1.94290987
## NDSSName.my.fctr#U.S.#Education
## -0.26695651
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.16586946
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.12932205
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 2.23109377
## NDSSName.my.fctrCulture#Arts#
## -0.17319452
## NDSSName.my.fctrForeign#World#
## -0.14135573
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.30346668
## NDSSName.my.fctrOpEd#Opinion#
## 2.48538732
## NDSSName.my.fctrScience#Health#
## 1.99127252
## NDSSName.my.fctrStyles##Fashion
## -0.25250399
## NDSSName.my.fctrStyles#U.S.#
## 1.82412410
## NDSSName.my.fctrTStyle##
## -0.42090722
## NDSSName.my.fctrTravel#Travel#
## -0.11316375
## PubDate.day.minutes.poly.1
## 9.79096645
## PubDate.day.minutes.poly.2
## 1.79653308
## PubDate.day.minutes.poly.4
## 3.98793812
## PubDate.hour.fctr(15.3,23]
## 0.03978968
## PubDate.last2.log1p
## 0.01083809
## PubDate.last4.log1p
## 0.01693989
## PubDate.wkend
## 0.15569751
## WordCount.log1p
## 0.14959363
## WordCount.root2
## 0.02370264
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg
## -0.59811229
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg
## 0.46880580
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -3.15153855
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 4.31724063
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg
## 0.70315244
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg
## 0.44783900
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg
## 1.13558865
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.832271960
## NDSSName.my.fctr#Multimedia#
## -0.057362968
## NDSSName.my.fctr#Opinion#RoomForDebate
## -0.690208382
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.048782794
## NDSSName.my.fctr#U.S.#Education
## -0.296522166
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook
## -0.177201520
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness
## -0.157735256
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 2.308574637
## NDSSName.my.fctrCulture#Arts#
## -0.187159278
## NDSSName.my.fctrForeign#World#
## -0.169354647
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.340667898
## NDSSName.my.fctrOpEd#Opinion#
## 2.572424625
## NDSSName.my.fctrScience#Health#
## 2.067845584
## NDSSName.my.fctrStyles##Fashion
## -0.291175749
## NDSSName.my.fctrStyles#U.S.#
## 1.900531124
## NDSSName.my.fctrTStyle##
## -0.447496009
## NDSSName.my.fctrTravel#Travel#
## -0.145402163
## NDSSName.my.fctrmyOther
## -0.020947092
## PubDate.day.minutes.poly.1
## 10.186502096
## PubDate.day.minutes.poly.2
## 2.103810556
## PubDate.day.minutes.poly.4
## 4.337265373
## PubDate.hour.fctr(15.3,23]
## 0.041843553
## PubDate.last2.log1p
## 0.012487863
## PubDate.last4.log1p
## 0.018581475
## PubDate.last8.log1p
## 0.001363957
## PubDate.wkend
## 0.166423165
## WordCount.log1p
## 0.156206169
## WordCount.root2
## 0.024718690
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg
## -0.765331026
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg
## 0.719908949
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -4.452382355
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg
## 0.043369062
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 5.091693780
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg
## 0.851577628
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg
## 0.693318411
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg
## 1.932426590
## [1] TRUE
# From here to save(), this should all be in one function
# these are executed in the same seq twice more:
# fit.data.training & predict.data.new chunks
# Add the selected model's prediction / error columns to the fit observations.
print(sprintf("%s fit prediction diagnostics:", glb_sel_mdl_id))
## [1] "All.X##rcv#glmnet fit prediction diagnostics:"
glbObsFit <- glb_get_predictions(df = glbObsFit, mdl_id = glb_sel_mdl_id,
rsp_var = glb_rsp_var)
# Same for the OOB observations.
print(sprintf("%s OOB prediction diagnostics:", glb_sel_mdl_id))
## [1] "All.X##rcv#glmnet OOB prediction diagnostics:"
glbObsOOB <- glb_get_predictions(df = glbObsOOB, mdl_id = glb_sel_mdl_id,
rsp_var = glb_rsp_var)
# Start a fresh feature-importance frame (featsimp_df=NULL) for the selected model.
glb_featsimp_df <-
myget_feats_importance(mdl=glb_sel_mdl, featsimp_df=NULL)
# Keep a copy of the importance under a model-specific column name
# ("<model id>.imp") alongside the generic imp column.
glb_featsimp_df[, paste0(glb_sel_mdl_id, ".imp")] <- glb_featsimp_df$imp
#mdl_id <-"RFE.X.glmnet"; glb_featsimp_df <- myget_feats_importance(glb_models_lst[[mdl_id]], glb_featsimp_df); glb_featsimp_df[, paste0(mdl_id, ".imp")] <- glb_featsimp_df$imp; print(glb_featsimp_df)
#print(head(sbst_featsimp_df <- subset(glb_featsimp_df, is.na(RFE.X.glmnet.imp) | (abs(RFE.X.YeoJohnson.glmnet.imp - RFE.X.glmnet.imp) > 0.0001), select=-imp)))
#print(orderBy(~ -cor.y.abs, subset(glb_feats_df, id %in% c(row.names(sbst_featsimp_df), "startprice.dcm1.is9", "D.weight.post.stop.sum"))))
print(glb_featsimp_df)
## imp
## PubDate.day.minutes.poly.1 100.00000
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg 63.88597
## PubDate.day.minutes.poly.4 59.18904
## NDSSName.my.fctrOpEd#Opinion# 47.21133
## NDSSName.my.fctrBusiness#Crosswords/Games# 45.38051
## NDSSName.my.fctrScience#Health# 43.69957
## PubDate.day.minutes.poly.2 43.63891
## NDSSName.my.fctr#Opinion#ThePublicEditor 43.52674
## NDSSName.my.fctrStyles#U.S.# 42.53063
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg 41.77929
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg 35.10334
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg 34.04441
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg 33.86620
## PubDate.wkend 30.50180
## WordCount.log1p 30.43597
## PubDate.hour.fctr(15.3,23] 29.64299
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg 29.59778
## WordCount.root2 29.52473
## PubDate.last4.log1p 29.48100
## PubDate.last2.log1p 29.43840
## PubDate.last8.log1p 29.36106
## .rnorm 29.35337
## NDSSName.my.fctrBusiness#Technology# 29.35337
## NDSSName.my.fctrCulture## 29.35337
## NDSSName.my.fctrMetro#N.Y./Region# 29.35337
## PubDate.date.fctr(7,13] 29.35337
## PubDate.date.fctr(13,19] 29.35337
## PubDate.date.fctr(19,25] 29.35337
## PubDate.date.fctr(25,31] 29.35337
## PubDate.day.minutes.poly.3 29.35337
## PubDate.day.minutes.poly.5 29.35337
## PubDate.hour.fctr(7.67,15.3] 29.35337
## PubDate.juliandate 29.35337
## PubDate.last16.log1p 29.35337
## PubDate.last32.log1p 29.35337
## PubDate.minute.fctr(14.8,29.5] 29.35337
## PubDate.minute.fctr(29.5,44.2] 29.35337
## PubDate.minute.fctr(44.2,59.1] 29.35337
## PubDate.month.fctr10 29.35337
## PubDate.month.fctr11 29.35337
## PubDate.month.fctr12 29.35337
## PubDate.second.fctr(14.8,29.5] 29.35337
## PubDate.second.fctr(29.5,44.2] 29.35337
## PubDate.second.fctr(44.2,59.1] 29.35337
## PubDate.wkday.fctr1 29.35337
## PubDate.wkday.fctr2 29.35337
## PubDate.wkday.fctr3 29.35337
## PubDate.wkday.fctr4 29.35337
## PubDate.wkday.fctr5 29.35337
## PubDate.wkday.fctr6 29.35337
## WordCount.nexp 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrmyOther 29.23532
## NDSSName.my.fctr#Multimedia# 28.98875
## NDSSName.my.fctrTravel#Travel# 28.38092
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 28.28957
## NDSSName.my.fctrForeign#World# 28.20781
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 28.13044
## NDSSName.my.fctrCulture#Arts# 28.06442
## NDSSName.my.fctrStyles##Fashion 27.37098
## NDSSName.my.fctr#U.S.#Education 27.32131
## NDSSName.my.fctrForeign#World#AsiaPacific 27.02315
## NDSSName.my.fctrTStyle## 26.26230
## NDSSName.my.fctr#Opinion#RoomForDebate 24.63959
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg 24.23150
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg 0.00000
## All.X##rcv#glmnet.imp
## PubDate.day.minutes.poly.1 100.00000
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg 63.88597
## PubDate.day.minutes.poly.4 59.18904
## NDSSName.my.fctrOpEd#Opinion# 47.21133
## NDSSName.my.fctrBusiness#Crosswords/Games# 45.38051
## NDSSName.my.fctrScience#Health# 43.69957
## PubDate.day.minutes.poly.2 43.63891
## NDSSName.my.fctr#Opinion#ThePublicEditor 43.52674
## NDSSName.my.fctrStyles#U.S.# 42.53063
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg 41.77929
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg 35.10334
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg 34.04441
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg 33.86620
## PubDate.wkend 30.50180
## WordCount.log1p 30.43597
## PubDate.hour.fctr(15.3,23] 29.64299
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg 29.59778
## WordCount.root2 29.52473
## PubDate.last4.log1p 29.48100
## PubDate.last2.log1p 29.43840
## PubDate.last8.log1p 29.36106
## .rnorm 29.35337
## NDSSName.my.fctrBusiness#Technology# 29.35337
## NDSSName.my.fctrCulture## 29.35337
## NDSSName.my.fctrMetro#N.Y./Region# 29.35337
## PubDate.date.fctr(7,13] 29.35337
## PubDate.date.fctr(13,19] 29.35337
## PubDate.date.fctr(19,25] 29.35337
## PubDate.date.fctr(25,31] 29.35337
## PubDate.day.minutes.poly.3 29.35337
## PubDate.day.minutes.poly.5 29.35337
## PubDate.hour.fctr(7.67,15.3] 29.35337
## PubDate.juliandate 29.35337
## PubDate.last16.log1p 29.35337
## PubDate.last32.log1p 29.35337
## PubDate.minute.fctr(14.8,29.5] 29.35337
## PubDate.minute.fctr(29.5,44.2] 29.35337
## PubDate.minute.fctr(44.2,59.1] 29.35337
## PubDate.month.fctr10 29.35337
## PubDate.month.fctr11 29.35337
## PubDate.month.fctr12 29.35337
## PubDate.second.fctr(14.8,29.5] 29.35337
## PubDate.second.fctr(29.5,44.2] 29.35337
## PubDate.second.fctr(44.2,59.1] 29.35337
## PubDate.wkday.fctr1 29.35337
## PubDate.wkday.fctr2 29.35337
## PubDate.wkday.fctr3 29.35337
## PubDate.wkday.fctr4 29.35337
## PubDate.wkday.fctr5 29.35337
## PubDate.wkday.fctr6 29.35337
## WordCount.nexp 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrmyOther 29.23532
## NDSSName.my.fctr#Multimedia# 28.98875
## NDSSName.my.fctrTravel#Travel# 28.38092
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 28.28957
## NDSSName.my.fctrForeign#World# 28.20781
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 28.13044
## NDSSName.my.fctrCulture#Arts# 28.06442
## NDSSName.my.fctrStyles##Fashion 27.37098
## NDSSName.my.fctr#U.S.#Education 27.32131
## NDSSName.my.fctrForeign#World#AsiaPacific 27.02315
## NDSSName.my.fctrTStyle## 26.26230
## NDSSName.my.fctr#Opinion#RoomForDebate 24.63959
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg 24.23150
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg 0.00000
# Used again in fit.data.training & predict.data.new chunks
# Print diagnostic plots for the predictions of model `mdl_id` on `obs_df`.
#
# Args:
#   obs_df:         data frame of observations. It is melted on each plotted
#                   feature with glb_rsp_var and the model's prediction column
#                   as measures, so it must contain all of those columns.
#   mdl_id:         id of the model whose predictions are plotted.
#   prob_threshold: forwarded to myplot_prediction_classification();
#                   defaults to NULL.
#
# Reads the globals glb_featsimp_df, glb_rsp_var, glb_id_var,
# glb_is_regression and glb_is_classification. Called for its printing side
# effects; the return value is whatever the last executed branch yields.
glb_analytics_diag_plots <- function(obs_df, mdl_id, prob_threshold = NULL) {
  # Distinct features to plot, most important first; stays empty when no
  # importance data is available.
  plot_feats <- character(0)

  if (!is.null(featsimp_df <- glb_featsimp_df)) {
    # Row names carry the model-matrix term names; strip backticks first.
    featsimp_df$feat <- gsub("`(.*?)`", "\\1", row.names(featsimp_df))
    # Split interaction terms "a:b" into feat = "a" and feat.interact = "b".
    featsimp_df$feat.interact <- gsub("(.*?):(.*)", "\\2", featsimp_df$feat)
    featsimp_df$feat <- gsub("(.*?):(.*)", "\\1", featsimp_df$feat)
    # Terms without ":" come back unchanged from the gsub above, i.e. they
    # have no interaction partner.
    featsimp_df$feat.interact <-
      ifelse(featsimp_df$feat.interact == featsimp_df$feat,
             NA, featsimp_df$feat.interact)
    # Collapse factor-level terms ("x.fctr<level>") onto their base feature.
    featsimp_df$feat <-
      gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", featsimp_df$feat)
    featsimp_df$feat.interact <-
      gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", featsimp_df$feat.interact)
    # Max importance per (feat, feat.interact) pair, highest first.
    featsimp_df <- orderBy(~ -imp.max,
                           summaryBy(imp ~ feat + feat.interact,
                                     data = featsimp_df, FUN = max))
    #rex_str=":(.*)"; txt_vctr=tail(featsimp_df$feat); ret_lst <- regexec(rex_str, txt_vctr); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
    featsimp_df <- subset(featsimp_df, !is.na(imp.max))
    if (nrow(featsimp_df) > 5) {
      warning("Limiting important feature scatter plots to 5 out of ",
              nrow(featsimp_df))
      featsimp_df <- head(featsimp_df, 5)
    }
    # if (!all(is.na(featsimp_df$feat.interact)))
    #   stop("not implemented yet")

    # NOTE(review): presumably the name of the prediction column that
    # glb_get_predictions() added for this model -- confirm in mydsutils.R.
    rsp_var_out <- mygetPredictIds(glb_rsp_var, mdl_id)$value

    # The rows are (feat, feat.interact) pairs, so one feature can show up
    # several times; plot each feature only once.
    plot_feats <- unique(as.character(featsimp_df$feat))
    for (var in plot_feats) {
      plot_df <- melt(obs_df, id.vars = var,
                      measure.vars = c(glb_rsp_var, rsp_var_out))
      print(myplot_scatter(plot_df, var, "value", colorcol_name = "variable",
                           facet_colcol_name = "variable", jitter = TRUE) +
              guides(color = "none"))
    }
  }

  # Axes for the prediction plot: the most important feature on y and the
  # next *distinct* feature (or the row names) on x.
  feat_y <- if (length(plot_feats) > 0) plot_feats[1] else NULL
  feat_x <- if (length(plot_feats) > 1) plot_feats[2] else ".rownames"

  if (glb_is_regression) {
    if (length(plot_feats) == 0) {
      warning("No important features in glb_fin_mdl")
    } else {
      # Possible additions to the plot:
      #   + facet_wrap(reformulate(featsimp_df$feat[2])) # if [1 or 2] is a factor
      #   + geom_point(aes_string(color="<col_name>.fctr")) # to color the plot
      print(myplot_prediction_regression(df = obs_df,
                                         feat_x = feat_x,
                                         feat_y = feat_y,
                                         rsp_var = glb_rsp_var,
                                         rsp_var_out = rsp_var_out,
                                         id_vars = glb_id_var))
    }
  }

  if (glb_is_classification) {
    if (length(plot_feats) == 0) {
      warning("No features in selected model are statistically important")
    } else {
      print(myplot_prediction_classification(df = obs_df,
                                             feat_x = feat_x,
                                             feat_y = feat_y,
                                             rsp_var = glb_rsp_var,
                                             rsp_var_out = rsp_var_out,
                                             id_vars = glb_id_var,
                                             prob_threshold = prob_threshold))
    }
  }
}
# Diagnostic plots for the selected model on the OOB observations. The
# model's OOB probability threshold is only supplied for binomial
# classification; otherwise the function's default is used.
if (glb_is_classification && glb_is_binomial) {
  glb_analytics_diag_plots(
    obs_df = glbObsOOB,
    mdl_id = glb_sel_mdl_id,
    prob_threshold = glb_models_df[glb_models_df$id == glb_sel_mdl_id,
                                   "opt.prob.threshold.OOB"]
  )
} else {
  glb_analytics_diag_plots(obs_df = glbObsOOB, mdl_id = glb_sel_mdl_id)
}
## Warning in glb_analytics_diag_plots(obs_df = glbObsOOB, mdl_id =
## glb_sel_mdl_id, : Limiting important feature scatter plots to 5 out of 28
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1 2555 N 0.02759794
## 2 302 N 0.25386138
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1 N FALSE
## 2 Y TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1 0.02759794
## 2 0.25386138
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1 TRUE
## 2 FALSE
## Popular.fctr.All.X..rcv.glmnet.accurate
## 1 TRUE
## 2 FALSE
## Popular.fctr.All.X..rcv.glmnet.error .label
## 1 0.0000000 2555
## 2 0.1538614 302
## [1] "Inaccurate: "
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1 172 Y 0.06304767
## 2 3554 Y 0.06505186
## 3 92 Y 0.06846652
## 4 3076 Y 0.07028159
## 5 4775 Y 0.07340651
## 6 6354 Y 0.07402876
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1 N TRUE
## 2 N TRUE
## 3 N TRUE
## 4 N TRUE
## 5 N TRUE
## 6 N TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1 0.9369523
## 2 0.9349481
## 3 0.9315335
## 4 0.9297184
## 5 0.9265935
## 6 0.9259712
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Popular.fctr.All.X..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Popular.fctr.All.X..rcv.glmnet.error
## 1 -0.03695233
## 2 -0.03494814
## 3 -0.03153348
## 4 -0.02971841
## 5 -0.02659349
## 6 -0.02597124
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 157 711 N 0.1109795
## 334 2997 N 0.1371334
## 363 1783 N 0.1431908
## 398 3198 N 0.1520313
## 402 322 N 0.1531120
## 638 483 N 0.7466027
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 157 Y TRUE
## 334 Y TRUE
## 363 Y TRUE
## 398 Y TRUE
## 402 Y TRUE
## 638 Y TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 157 0.1109795
## 334 0.1371334
## 363 0.1431908
## 398 0.1520313
## 402 0.1531120
## 638 0.7466027
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 157 FALSE
## 334 FALSE
## 363 FALSE
## 398 FALSE
## 402 FALSE
## 638 FALSE
## Popular.fctr.All.X..rcv.glmnet.accurate
## 157 FALSE
## 334 FALSE
## 363 FALSE
## 398 FALSE
## 402 FALSE
## 638 FALSE
## Popular.fctr.All.X..rcv.glmnet.error
## 157 0.01097955
## 334 0.03713335
## 363 0.04319079
## 398 0.05203135
## 402 0.05311195
## 638 0.64660268
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 644 4943 N 0.7797207
## 645 221 N 0.7836057
## 646 472 N 0.7920216
## 647 1448 N 0.8059659
## 648 3590 N 0.8061825
## 649 2995 N 0.8105748
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 644 Y TRUE
## 645 Y TRUE
## 646 Y TRUE
## 647 Y TRUE
## 648 Y TRUE
## 649 Y TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 644 0.7797207
## 645 0.7836057
## 646 0.7920216
## 647 0.8059659
## 648 0.8061825
## 649 0.8105748
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 644 FALSE
## 645 FALSE
## 646 FALSE
## 647 FALSE
## 648 FALSE
## 649 FALSE
## Popular.fctr.All.X..rcv.glmnet.accurate
## 644 FALSE
## 645 FALSE
## 646 FALSE
## 647 FALSE
## 648 FALSE
## 649 FALSE
## Popular.fctr.All.X..rcv.glmnet.error
## 644 0.6797207
## 645 0.6836057
## 646 0.6920216
## 647 0.7059659
## 648 0.7061825
## 649 0.7105748
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
# Extend the per-category table with the selected model's error statistics
# for the fit and OOB observations, then print it (worst category first)
# together with its column totals. Skipped when no category feature is set.
if (!is.null(glbFeatsCategory)) {
  # Fit statistics: outer join keyed on the category feature.
  glbLvlCategory <- merge(
    glbLvlCategory,
    myget_category_stats(obs_df = glbObsFit, mdl_id = glb_sel_mdl_id,
                         label = "fit"),
    by = glbFeatsCategory, all = TRUE
  )
  row.names(glbLvlCategory) <- glbLvlCategory[, glbFeatsCategory]

  # OOB statistics: outer join on every column the two tables share.
  #by=glbFeatsCategory, all=TRUE) glb_ctgry-df already contains .n.OOB ?
  glbLvlCategory <- merge(
    glbLvlCategory,
    myget_category_stats(obs_df = glbObsOOB, mdl_id = glb_sel_mdl_id,
                         label = "OOB"),
    all = TRUE
  )
  row.names(glbLvlCategory) <- glbLvlCategory[, glbFeatsCategory]

  # Sort by mean absolute OOB error when any evaluation metric is OOB-based,
  # otherwise by mean absolute fit error.
  if (any(grepl("OOB", glbMdlMetricsEval))) {
    print(orderBy(~ -err.abs.OOB.mean, glbLvlCategory))
  } else {
    print(orderBy(~ -err.abs.fit.mean, glbLvlCategory))
  }

  # Column totals over everything except the category column(s).
  print(colSums(
    glbLvlCategory[, -grep(glbFeatsCategory, names(glbLvlCategory))]
  ))
}
## NDSSName.my.fctr
## OpEd#Opinion# OpEd#Opinion#
## #Opinion#ThePublicEditor #Opinion#ThePublicEditor
## Styles#U.S.# Styles#U.S.#
## Business#Crosswords/Games# Business#Crosswords/Games#
## Science#Health# Science#Health#
## Business#Technology# Business#Technology#
## ## ##
## Business#BusinessDay#Dealbook Business#BusinessDay#Dealbook
## Metro#N.Y./Region# Metro#N.Y./Region#
## Culture#Arts# Culture#Arts#
## #Opinion#RoomForDebate #Opinion#RoomForDebate
## Styles##Fashion Styles##Fashion
## Business#BusinessDay#SmallBusiness Business#BusinessDay#SmallBusiness
## myOther myOther
## Travel#Travel# Travel#Travel#
## Culture## Culture##
## Foreign#World#AsiaPacific Foreign#World#AsiaPacific
## #Multimedia# #Multimedia#
## TStyle## TStyle##
## #U.S.#Education #U.S.#Education
## Foreign#World# Foreign#World#
## .n.OOB .n.Fit .n.Tst .freqRatio.Fit
## OpEd#Opinion# 89 437 164 0.090965862
## #Opinion#ThePublicEditor 4 16 10 0.003330558
## Styles#U.S.# 50 127 61 0.026436303
## Business#Crosswords/Games# 18 105 42 0.021856786
## Science#Health# 48 148 57 0.030807660
## Business#Technology# 126 213 114 0.044338052
## ## 371 913 342 0.190049958
## Business#BusinessDay#Dealbook 323 629 304 0.130932556
## Metro#N.Y./Region# 70 128 67 0.026644463
## Culture#Arts# 185 490 174 0.101998335
## #Opinion#RoomForDebate 20 42 20 0.008742714
## Styles##Fashion 15 104 15 0.021648626
## Business#BusinessDay#SmallBusiness 40 100 41 0.020815987
## myOther 5 33 5 0.006869276
## Travel#Travel# 34 83 35 0.017277269
## Culture## 1 NA 70 NA
## Foreign#World#AsiaPacific 53 150 56 0.031223980
## #Multimedia# 49 92 52 0.019150708
## TStyle## 101 623 105 0.129683597
## #U.S.#Education 82 243 89 0.050582848
## Foreign#World# 44 128 47 0.026644463
## .freqRatio.OOB .freqRatio.Tst
## OpEd#Opinion# 0.0515046296 0.087700535
## #Opinion#ThePublicEditor 0.0023148148 0.005347594
## Styles#U.S.# 0.0289351852 0.032620321
## Business#Crosswords/Games# 0.0104166667 0.022459893
## Science#Health# 0.0277777778 0.030481283
## Business#Technology# 0.0729166667 0.060962567
## ## 0.2146990741 0.182887701
## Business#BusinessDay#Dealbook 0.1869212963 0.162566845
## Metro#N.Y./Region# 0.0405092593 0.035828877
## Culture#Arts# 0.1070601852 0.093048128
## #Opinion#RoomForDebate 0.0115740741 0.010695187
## Styles##Fashion 0.0086805556 0.008021390
## Business#BusinessDay#SmallBusiness 0.0231481481 0.021925134
## myOther 0.0028935185 0.002673797
## Travel#Travel# 0.0196759259 0.018716578
## Culture## 0.0005787037 0.037433155
## Foreign#World#AsiaPacific 0.0306712963 0.029946524
## #Multimedia# 0.0283564815 0.027807487
## TStyle## 0.0584490741 0.056149733
## #U.S.#Education 0.0474537037 0.047593583
## Foreign#World# 0.0254629630 0.025133690
## err.abs.fit.sum err.abs.fit.mean .n.fit
## OpEd#Opinion# 169.357052 0.38754474 437
## #Opinion#ThePublicEditor 7.057990 0.44112438 16
## Styles#U.S.# 62.316357 0.49067998 127
## Business#Crosswords/Games# 37.686969 0.35892352 105
## Science#Health# 67.078814 0.45323523 148
## Business#Technology# 46.021544 0.21606359 213
## ## 132.789459 0.14544300 913
## Business#BusinessDay#Dealbook 96.786571 0.15387372 629
## Metro#N.Y./Region# 19.897531 0.15544946 128
## Culture#Arts# 60.243557 0.12294604 490
## #Opinion#RoomForDebate 6.536618 0.15563377 42
## Styles##Fashion 8.952975 0.08608630 104
## Business#BusinessDay#SmallBusiness 13.037114 0.13037114 100
## myOther 3.720547 0.11274386 33
## Travel#Travel# 6.797123 0.08189305 83
## Culture## NA NA NA
## Foreign#World#AsiaPacific 15.547113 0.10364742 150
## #Multimedia# 8.344712 0.09070339 92
## TStyle## 43.563105 0.06992473 623
## #U.S.#Education 15.463174 0.06363446 243
## Foreign#World# 8.936229 0.06981429 128
## err.abs.OOB.sum err.abs.OOB.mean
## OpEd#Opinion# 46.7314088 0.52507201
## #Opinion#ThePublicEditor 1.9510549 0.48776373
## Styles#U.S.# 23.6200366 0.47240073
## Business#Crosswords/Games# 8.3838865 0.46577147
## Science#Health# 22.1985061 0.46246888
## Business#Technology# 29.9693251 0.23785179
## ## 78.0190757 0.21029400
## Business#BusinessDay#Dealbook 66.1823064 0.20489878
## Metro#N.Y./Region# 13.5214374 0.19316339
## Culture#Arts# 34.6698010 0.18740433
## #Opinion#RoomForDebate 3.7374924 0.18687462
## Styles##Fashion 2.1595216 0.14396810
## Business#BusinessDay#SmallBusiness 5.6273209 0.14068302
## myOther 0.5675341 0.11350681
## Travel#Travel# 3.6253187 0.10662702
## Culture## 0.1028114 0.10281135
## Foreign#World#AsiaPacific 5.3378398 0.10071396
## #Multimedia# 4.8367383 0.09870894
## TStyle## 9.5082891 0.09414148
## #U.S.#Education 6.0162144 0.07336847
## Foreign#World# 3.1957814 0.07263139
## .n.OOB .n.Fit .n.Tst .freqRatio.Fit
## 1728.000000 NA 1870.000000 NA
## .freqRatio.OOB .freqRatio.Tst err.abs.fit.sum err.abs.fit.mean
## 1.000000 1.000000 NA NA
## .n.fit err.abs.OOB.sum err.abs.OOB.mean
## NA 369.961700 4.681124
# Export the OOB observations: the id column(s) plus every column whose name
# contains the response variable name. The file name is
# "<glb_out_pfx><glb_sel_mdl_id>" with "." replaced by "_", followed by
# "_OOBobs.csv".
write.csv(
  glbObsOOB[, c(glb_id_var,
                grep(glb_rsp_var, names(glbObsOOB), fixed = TRUE,
                     value = TRUE))],
  paste0(chartr(".", "_", paste0(glb_out_pfx, glb_sel_mdl_id)),
         "_OOBobs.csv"),
  row.names = FALSE
)
# Start a new chunk log for fit.models_2, labelled "teardown".
# NOTE(review): myadd_chunk() is defined outside this file; from the printed
# output it appears to append a timing row -- confirm in myscript.R.
fit.models_2_chunk_df <- myadd_chunk(
  NULL,
  "fit.models_2_bgn",
  label.minor = "teardown"
)
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_2_bgn 1 0 teardown 442.674 NA NA
# Add the next "fit.models" entry to the global chunk log without
# incrementing the major step.
glb_chunks_df <- myadd_chunk(
  glb_chunks_df,
  "fit.models",
  major.inc = FALSE
)
## label step_major step_minor label_minor bgn end elapsed
## 12 fit.models 6 2 2 416.565 442.684 26.119
## 13 fit.models 6 3 3 442.685 NA NA
# if (sum(is.na(glbObsAll$D.P.http)) > 0)
# stop("fit.models_3: Why is this happening ?")
#stop("here"); glb2Sav()
sync_glb_obs_df <- function() {
    # Copy columns that exist only on the Fit / OOB subsets back onto the matching
    # rows (selected through .lcn) of the global glbObsTrn and glbObsAll data frames.
    # Takes no arguments; works entirely on globals and updates them with <<-.
    # Merge or cbind ?

    # Fit-only columns -> rows tagged "Fit"
    for (fit_col in setdiff(names(glbObsFit), names(glbObsTrn))) {
        glbObsTrn[glbObsTrn$.lcn == "Fit", fit_col] <<- glbObsFit[, fit_col]
    }
    for (fit_col in setdiff(names(glbObsFit), names(glbObsAll))) {
        glbObsAll[glbObsAll$.lcn == "Fit", fit_col] <<- glbObsFit[, fit_col]
    }

    # OOB-only columns -> rows tagged "OOB".
    # glbObsTrn is only updated when every glb_rsp_var value in glbObsNew is NA;
    # glbObsAll is always updated.
    if (all(is.na(glbObsNew[, glb_rsp_var]))) {
        for (oob_col in setdiff(names(glbObsOOB), names(glbObsTrn))) {
            glbObsTrn[glbObsTrn$.lcn == "OOB", oob_col] <<- glbObsOOB[, oob_col]
        }
    }
    for (oob_col in setdiff(names(glbObsOOB), names(glbObsAll))) {
        glbObsAll[glbObsAll$.lcn == "OOB", oob_col] <<- glbObsOOB[, oob_col]
    }
}
# Push the Fit/OOB-only columns back into the global glbObsTrn / glbObsAll frames
sync_glb_obs_df()
# Sanity check: names of glbObsNew columns that are absent from glbObsAll
print(setdiff(names(glbObsNew), names(glbObsAll)))
## character(0)
# Optionally persist the model-selection state to <glb_out_pfx>selmdl_dsk.RData
if (glb_save_envir) {
    save(glb_feats_df,
         glbObsAll, #glbObsTrn, glbObsFit, glbObsOOB, glbObsNew,
         glb_models_df, dsp_models_df, glb_models_lst,
         glb_sel_mdl, glb_sel_mdl_id,
         glb_model_type,
         file = paste0(glb_out_pfx, "selmdl_dsk.RData"))
}
#load(paste0(glb_out_pfx, "selmdl_dsk.RData"))
# ret_lst may not exist at this point (the knitted run reports
# "object 'ret_lst' not found" here), so only remove it when it is present in
# the current environment.
if (exists("ret_lst", inherits = FALSE)) rm(ret_lst)
## Warning in rm(ret_lst): object 'ret_lst' not found
# Register "model.selected" as an available object, then replay the analytics
# petri net with the updated list of available objects.
glb_analytics_avl_objs <- c(glb_analytics_avl_objs, "model.selected")
replay.petrisim(pn = glb_analytics_pn,
                replay.trans = glb_analytics_avl_objs,
                flip_coord = TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
# Open the next major step, "fit.data.training", in the global chunk-timing table
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc = TRUE)
## label step_major step_minor label_minor bgn end
## 13 fit.models 6 3 3 442.685 449.661
## 14 fit.data.training 7 0 0 449.661 NA
## elapsed
## 13 6.976
## 14 NA
7.0: fit data training
#load(paste0(glb_inp_pfx, "dsk.RData"))
#stop("here"); glb2Sav()
# Decide which model becomes the final model (glb_fin_mdl / glb_fin_mdl_id):
#   1. glb_fin_mdl_id already names an entry of glb_models_lst -> reuse that model;
#   2. glbObsNew holds at least one non-NA glb_rsp_var value -> reuse glb_sel_mdl
#      under the id "Final.<glb_sel_mdl_id>";
#   3. otherwise -> re-run RFE on glbObsTrn, refit with repeated CV on the training
#      observations, and take the last entry of glb_models_lst as the final model.
if (!is.null(glb_fin_mdl_id) && (glb_fin_mdl_id %in% names(glb_models_lst))) {
warning("Final model same as user selected model")
glb_fin_mdl <- glb_models_lst[[glb_fin_mdl_id]]
} else
# if (nrow(glbObsFit) + length(glbObsFitOutliers) == nrow(glbObsTrn))
if (!all(is.na(glbObsNew[, glb_rsp_var])))
{
warning("Final model same as glb_sel_mdl_id")
glb_fin_mdl_id <- paste0("Final.", glb_sel_mdl_id)
glb_fin_mdl <- glb_sel_mdl
glb_models_lst[[glb_fin_mdl_id]] <- glb_fin_mdl
} else {
# if (grepl("RFE", glb_sel_mdl_id) ||
# (!is.null(glb_mdl_ensemble) && grepl("RFE", glb_mdl_ensemble))) {
# Candidate features: ids of glb_feats_df rows with nzv FALSE and exclude.as.feat != 1
indep_vars <- myadjust_interaction_feats(subset(glb_feats_df,
!nzv & (exclude.as.feat != 1))[, "id"])
rfe_trn_results <-
myrun_rfe(glbObsTrn, indep_vars, glbRFESizes[["Final"]])
# Report predictors that differ between this RFE run and rfe_fit_results
# (rfe_fit_results is assigned outside this block)
if (!isTRUE(all.equal(sort(predictors(rfe_trn_results)),
sort(predictors(rfe_fit_results))))) {
print("Diffs predictors(rfe_trn_results) vs. predictors(rfe_fit_results):")
print(setdiff(predictors(rfe_trn_results), predictors(rfe_fit_results)))
print("Diffs predictors(rfe_fit_results) vs. predictors(rfe_trn_results):")
print(setdiff(predictors(rfe_fit_results), predictors(rfe_trn_results)))
}
# }
# Ensemble selected: refit each sufficiently important component model on glbObsTrn
# and add its predictions to glbObsTrn and glbObsNew
if (grepl("Ensemble", glb_sel_mdl_id)) {
# Find which models are relevant
mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), imp > 5)
# Fit selected models on glbObsTrn
# Component ids = row names of mdlimp_df with the mygetPredictIds(glb_rsp_var)$value
# string and ".prob" removed
for (mdl_id in gsub(".prob", "",
gsub(mygetPredictIds(glb_rsp_var)$value, "", row.names(mdlimp_df), fixed = TRUE),
fixed = TRUE)) {
# The last "."-separated component of the id is replaced by "Train" for the new id
# prefix and is passed as train.method below.
# NOTE(review): ids printed in this run use "#" separators (e.g. "Final##rcv#glmnet"),
# so splitting on "." may not isolate the method name - confirm.
mdl_id_components <- unlist(strsplit(mdl_id, "[.]"))
mdlIdPfx <- paste0(c(head(mdl_id_components, -1), "Train"),
collapse = ".")
# RFE-based components use the predictors of the new RFE run; all others reuse the
# comma-separated "feats" recorded for the component in glb_models_df
if (grepl("RFE\\.X\\.", mdlIdPfx))
mdlIndepVars <- myadjust_interaction_feats(myextract_actual_feats(
predictors(rfe_trn_results))) else
mdlIndepVars <- trim(unlist(
strsplit(glb_models_df[glb_models_df$id == mdl_id, "feats"], "[,]")))
ret_lst <-
myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = mdlIdPfx,
type = glb_model_type, tune.df = glb_tune_models_df,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = tail(mdl_id_components, 1))),
indep_vars = mdlIndepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsTrn, OOB_df = NULL)
# Predict with the most recently added id in glb_models_df, using the OOB probability
# threshold recorded for the original component id (mdl_id).
# NOTE(review): presumably the last glb_models_df row is the model just fit - confirm.
glbObsTrn <- glb_get_predictions(df = glbObsTrn,
mdl_id = tail(glb_models_df$id, 1),
rsp_var = glb_rsp_var,
prob_threshold_def =
subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
glbObsNew <- glb_get_predictions(df = glbObsNew,
mdl_id = tail(glb_models_df$id, 1),
rsp_var = glb_rsp_var,
prob_threshold_def =
subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
}
}
# "Final" model
# NOTE(review): model_method is assigned here but not read again within this block;
# the methods actually fit come from myparseMdlId(glb_sel_mdl_id)$alg below.
if ((model_method <- glb_sel_mdl$method) == "custom")
# get actual method from the mdl_id
model_method <- tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)
# Independent variables for the final fit:
#   - Ensemble: names of the important component predictions with ".Train" inserted
#     before the last "."-separated part (before "<part>.prob" for binomial classification);
#   - "RFE.X" selection: features extracted from the predictors of the new RFE run;
#   - otherwise: the comma-separated "feats" recorded for glb_sel_mdl_id in glb_models_df.
if (grepl("Ensemble", glb_sel_mdl_id)) {
# Find which models are relevant
mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), imp > 5)
if (glb_is_classification && glb_is_binomial)
indep_vars_vctr <- gsub("(.*)\\.(.*)\\.prob", "\\1\\.Train\\.\\2\\.prob",
row.names(mdlimp_df)) else
indep_vars_vctr <- gsub("(.*)\\.(.*)", "\\1\\.Train\\.\\2",
row.names(mdlimp_df))
} else
if (grepl("RFE.X", glb_sel_mdl_id, fixed = TRUE)) {
indep_vars_vctr <- myextract_actual_feats(predictors(rfe_trn_results))
} else indep_vars_vctr <-
trim(unlist(strsplit(glb_models_df[glb_models_df$id ==
glb_sel_mdl_id
, "feats"], "[,]")))
# If any entry of glb_preproc_methods occurs in glb_sel_mdl_id, pass the matched
# substring as train.preProcess; otherwise pass NULL
if (!is.null(glb_preproc_methods) &&
((match_pos <- regexpr(gsub(".", "\\.",
paste(glb_preproc_methods, collapse = "|"),
fixed = TRUE), glb_sel_mdl_id)) != -1))
ths_preProcess <- str_sub(glb_sel_mdl_id, match_pos,
match_pos + attr(match_pos, "match.length") - 1) else
ths_preProcess <- NULL
mdl_id_pfx <- ifelse(grepl("Ensemble", glb_sel_mdl_id),
"Final.Ensemble", "Final")
# Training rows for the final fit: all of glbObsTrn unless ids are listed in
# glbObsTrnOutliers[[mdl_id_pfx]], in which case those rows are dropped
trnobs_df <- if (is.null(glbObsTrnOutliers[[mdl_id_pfx]])) glbObsTrn else
glbObsTrn[!(glbObsTrn[, glb_id_var] %in%
glbObsTrnOutliers[[mdl_id_pfx]]), ]
# Force fitting of Final.glm to identify outliers
#method_vctr <- unique(c("glm", myparseMdlId(glb_sel_mdl_id)$alg))
# or skip glm for speed
method_vctr <- myparseMdlId(glb_sel_mdl_id)$alg
for (method in method_vctr) {
#source("caret_nominalTrainWorkflow.R")
# glmnet requires at least 2 indep vars
if ((length(indep_vars_vctr) == 1) && (method %in% "glmnet"))
next
ret_lst <-
myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = mdl_id_pfx,
type = glb_model_type, trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.allowParallel = if (method %in% c("glm", "glmnet")) FALSE else TRUE,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = method,
train.preProcess = ths_preProcess)),
indep_vars = indep_vars_vctr, rsp_var = glb_rsp_var,
fit_df = trnobs_df, OOB_df = NULL)
}
# The final model is the last entry of glb_models_lst; its id is read from the row of
# glb_models_df at the same position.
# NOTE(review): this assumes glb_models_df rows line up with glb_models_lst and that the
# loop above actually fit a model (a skipped glmnet would leave an earlier model as the
# last entry) - confirm.
if ((length(method_vctr) == 1) || (method != "glm")) {
glb_fin_mdl <- glb_models_lst[[length(glb_models_lst)]]
glb_fin_mdl_id <- glb_models_df[length(glb_models_lst), "id"]
}
}
## +(rfe) fit Fold1.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep1 size: 60
## +(rfe) imp Fold1.Rep1
## -(rfe) imp Fold1.Rep1
## +(rfe) fit Fold1.Rep1 size: 32
## -(rfe) fit Fold1.Rep1 size: 32
## +(rfe) fit Fold1.Rep1 size: 16
## -(rfe) fit Fold1.Rep1 size: 16
## +(rfe) fit Fold1.Rep1 size: 8
## -(rfe) fit Fold1.Rep1 size: 8
## +(rfe) fit Fold1.Rep1 size: 4
## -(rfe) fit Fold1.Rep1 size: 4
## +(rfe) fit Fold1.Rep1 size: 2
## -(rfe) fit Fold1.Rep1 size: 2
## +(rfe) fit Fold2.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep1 size: 60
## +(rfe) imp Fold2.Rep1
## -(rfe) imp Fold2.Rep1
## +(rfe) fit Fold2.Rep1 size: 32
## -(rfe) fit Fold2.Rep1 size: 32
## +(rfe) fit Fold2.Rep1 size: 16
## -(rfe) fit Fold2.Rep1 size: 16
## +(rfe) fit Fold2.Rep1 size: 8
## -(rfe) fit Fold2.Rep1 size: 8
## +(rfe) fit Fold2.Rep1 size: 4
## -(rfe) fit Fold2.Rep1 size: 4
## +(rfe) fit Fold2.Rep1 size: 2
## -(rfe) fit Fold2.Rep1 size: 2
## +(rfe) fit Fold3.Rep1 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep1 size: 60
## +(rfe) imp Fold3.Rep1
## -(rfe) imp Fold3.Rep1
## +(rfe) fit Fold3.Rep1 size: 32
## -(rfe) fit Fold3.Rep1 size: 32
## +(rfe) fit Fold3.Rep1 size: 16
## -(rfe) fit Fold3.Rep1 size: 16
## +(rfe) fit Fold3.Rep1 size: 8
## -(rfe) fit Fold3.Rep1 size: 8
## +(rfe) fit Fold3.Rep1 size: 4
## -(rfe) fit Fold3.Rep1 size: 4
## +(rfe) fit Fold3.Rep1 size: 2
## -(rfe) fit Fold3.Rep1 size: 2
## +(rfe) fit Fold1.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep2 size: 60
## +(rfe) imp Fold1.Rep2
## -(rfe) imp Fold1.Rep2
## +(rfe) fit Fold1.Rep2 size: 32
## -(rfe) fit Fold1.Rep2 size: 32
## +(rfe) fit Fold1.Rep2 size: 16
## -(rfe) fit Fold1.Rep2 size: 16
## +(rfe) fit Fold1.Rep2 size: 8
## -(rfe) fit Fold1.Rep2 size: 8
## +(rfe) fit Fold1.Rep2 size: 4
## -(rfe) fit Fold1.Rep2 size: 4
## +(rfe) fit Fold1.Rep2 size: 2
## -(rfe) fit Fold1.Rep2 size: 2
## +(rfe) fit Fold2.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep2 size: 60
## +(rfe) imp Fold2.Rep2
## -(rfe) imp Fold2.Rep2
## +(rfe) fit Fold2.Rep2 size: 32
## -(rfe) fit Fold2.Rep2 size: 32
## +(rfe) fit Fold2.Rep2 size: 16
## -(rfe) fit Fold2.Rep2 size: 16
## +(rfe) fit Fold2.Rep2 size: 8
## -(rfe) fit Fold2.Rep2 size: 8
## +(rfe) fit Fold2.Rep2 size: 4
## -(rfe) fit Fold2.Rep2 size: 4
## +(rfe) fit Fold2.Rep2 size: 2
## -(rfe) fit Fold2.Rep2 size: 2
## +(rfe) fit Fold3.Rep2 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep2 size: 60
## +(rfe) imp Fold3.Rep2
## -(rfe) imp Fold3.Rep2
## +(rfe) fit Fold3.Rep2 size: 32
## -(rfe) fit Fold3.Rep2 size: 32
## +(rfe) fit Fold3.Rep2 size: 16
## -(rfe) fit Fold3.Rep2 size: 16
## +(rfe) fit Fold3.Rep2 size: 8
## -(rfe) fit Fold3.Rep2 size: 8
## +(rfe) fit Fold3.Rep2 size: 4
## -(rfe) fit Fold3.Rep2 size: 4
## +(rfe) fit Fold3.Rep2 size: 2
## -(rfe) fit Fold3.Rep2 size: 2
## +(rfe) fit Fold1.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold1.Rep3 size: 60
## +(rfe) imp Fold1.Rep3
## -(rfe) imp Fold1.Rep3
## +(rfe) fit Fold1.Rep3 size: 32
## -(rfe) fit Fold1.Rep3 size: 32
## +(rfe) fit Fold1.Rep3 size: 16
## -(rfe) fit Fold1.Rep3 size: 16
## +(rfe) fit Fold1.Rep3 size: 8
## -(rfe) fit Fold1.Rep3 size: 8
## +(rfe) fit Fold1.Rep3 size: 4
## -(rfe) fit Fold1.Rep3 size: 4
## +(rfe) fit Fold1.Rep3 size: 2
## -(rfe) fit Fold1.Rep3 size: 2
## +(rfe) fit Fold2.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold2.Rep3 size: 60
## +(rfe) imp Fold2.Rep3
## -(rfe) imp Fold2.Rep3
## +(rfe) fit Fold2.Rep3 size: 32
## -(rfe) fit Fold2.Rep3 size: 32
## +(rfe) fit Fold2.Rep3 size: 16
## -(rfe) fit Fold2.Rep3 size: 16
## +(rfe) fit Fold2.Rep3 size: 8
## -(rfe) fit Fold2.Rep3 size: 8
## +(rfe) fit Fold2.Rep3 size: 4
## -(rfe) fit Fold2.Rep3 size: 4
## +(rfe) fit Fold2.Rep3 size: 2
## -(rfe) fit Fold2.Rep3 size: 2
## +(rfe) fit Fold3.Rep3 size: 60
## Warning in lda.default(x, grouping, ...): variables are collinear
## -(rfe) fit Fold3.Rep3 size: 60
## +(rfe) imp Fold3.Rep3
## -(rfe) imp Fold3.Rep3
## +(rfe) fit Fold3.Rep3 size: 32
## -(rfe) fit Fold3.Rep3 size: 32
## +(rfe) fit Fold3.Rep3 size: 16
## -(rfe) fit Fold3.Rep3 size: 16
## +(rfe) fit Fold3.Rep3 size: 8
## -(rfe) fit Fold3.Rep3 size: 8
## +(rfe) fit Fold3.Rep3 size: 4
## -(rfe) fit Fold3.Rep3 size: 4
## +(rfe) fit Fold3.Rep3 size: 2
## -(rfe) fit Fold3.Rep3 size: 2
## Warning in lda.default(x, grouping, ...): variables are collinear
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (3 fold, repeated 3 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 2 0.8204 0.03718 0.003445 0.01035
## 4 0.8760 0.44502 0.003122 0.01919
## 8 0.8738 0.44420 0.003283 0.02061
## 16 0.9016 0.63787 0.006552 0.02352
## 32 0.9013 0.63732 0.006547 0.02350
## 60 0.9029 0.64607 0.006800 0.02399 *
##
## The top 5 variables (out of 60):
## WordCount.log1p, WordCount.root2, WordCount.nexp, NDSSName.my.fctrOpEd#Opinion#, PubDate.day.minutes.poly.1
##
## [1] "WordCount.log1p"
## [2] "WordCount.root2"
## [3] "WordCount.nexp"
## [4] "NDSSName.my.fctrOpEd#Opinion#"
## [5] "PubDate.day.minutes.poly.1"
## [6] "PubDate.day.minutes.poly.4"
## [7] "PubDate.hour.fctr(15.3,23]"
## [8] "PubDate.last4.log1p"
## [9] "PubDate.last2.log1p"
## [10] "NDSSName.my.fctrScience#Health#"
## [11] "NDSSName.my.fctrBusiness#Crosswords/Games#"
## [12] "PubDate.day.minutes.poly.5"
## [13] "PubDate.last8.log1p"
## [14] "NDSSName.my.fctrStyles#U.S.#"
## [15] "PubDate.wkend"
## [16] "PubDate.last16.log1p"
## [17] "NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg"
## [18] "PubDate.day.minutes.poly.2"
## [19] "PubDate.juliandate"
## [20] "PubDate.wkday.fctr6"
## [21] "PubDate.month.fctr11"
## [22] "PubDate.second.fctr(14.8,29.5]"
## [23] "PubDate.date.fctr(7,13]"
## [24] ".rnorm"
## [25] "NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg"
## [26] "PubDate.wkday.fctr1"
## [27] "PubDate.day.minutes.poly.3"
## [28] "NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg"
## [29] "PubDate.date.fctr(25,31]"
## [30] "PubDate.last32.log1p"
## [31] "PubDate.hour.fctr(7.67,15.3]"
## [32] "NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg"
## [33] "PubDate.minute.fctr(14.8,29.5]"
## [34] "NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg"
## [35] "PubDate.month.fctr10"
## [36] "NDSSName.my.fctrBusiness#Technology#"
## [37] "NDSSName.my.fctrmyOther"
## [38] "PubDate.wkday.fctr3"
## [39] "PubDate.date.fctr(13,19]"
## [40] "PubDate.second.fctr(29.5,44.2]"
## [41] "PubDate.minute.fctr(44.2,59.1]"
## [42] "PubDate.wkday.fctr4"
## [43] "PubDate.second.fctr(44.2,59.1]"
## [44] "NDSSName.my.fctr#Opinion#RoomForDebate"
## [45] "PubDate.date.fctr(19,25]"
## [46] "NDSSName.my.fctrMetro#N.Y./Region#"
## [47] "NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness"
## [48] "NDSSName.my.fctrTravel#Travel#"
## [49] "NDSSName.my.fctrStyles##Fashion"
## [50] "NDSSName.my.fctr#Multimedia#"
## [51] "PubDate.wkday.fctr2"
## [52] "NDSSName.my.fctrForeign#World#"
## [53] "NDSSName.my.fctrForeign#World#AsiaPacific"
## [54] "PubDate.wkday.fctr5"
## [55] "PubDate.minute.fctr(29.5,44.2]"
## [56] "NDSSName.my.fctr#U.S.#Education"
## [57] "NDSSName.my.fctrCulture#Arts#"
## [58] "NDSSName.my.fctrBusiness#BusinessDay#Dealbook"
## [59] "NDSSName.my.fctr##"
## [60] "NDSSName.my.fctrTStyle##"
## [1] "fitting model: Final##rcv#glmnet"
## [1] " indep_vars: WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg"
## + Fold1.Rep1: alpha=0.100, lambda=0.07781
## - Fold1.Rep1: alpha=0.100, lambda=0.07781
## + Fold1.Rep1: alpha=0.325, lambda=0.07781
## - Fold1.Rep1: alpha=0.325, lambda=0.07781
## + Fold1.Rep1: alpha=0.550, lambda=0.07781
## - Fold1.Rep1: alpha=0.550, lambda=0.07781
## + Fold1.Rep1: alpha=0.775, lambda=0.07781
## - Fold1.Rep1: alpha=0.775, lambda=0.07781
## + Fold1.Rep1: alpha=1.000, lambda=0.07781
## - Fold1.Rep1: alpha=1.000, lambda=0.07781
## + Fold2.Rep1: alpha=0.100, lambda=0.07781
## - Fold2.Rep1: alpha=0.100, lambda=0.07781
## + Fold2.Rep1: alpha=0.325, lambda=0.07781
## - Fold2.Rep1: alpha=0.325, lambda=0.07781
## + Fold2.Rep1: alpha=0.550, lambda=0.07781
## - Fold2.Rep1: alpha=0.550, lambda=0.07781
## + Fold2.Rep1: alpha=0.775, lambda=0.07781
## - Fold2.Rep1: alpha=0.775, lambda=0.07781
## + Fold2.Rep1: alpha=1.000, lambda=0.07781
## - Fold2.Rep1: alpha=1.000, lambda=0.07781
## + Fold3.Rep1: alpha=0.100, lambda=0.07781
## - Fold3.Rep1: alpha=0.100, lambda=0.07781
## + Fold3.Rep1: alpha=0.325, lambda=0.07781
## - Fold3.Rep1: alpha=0.325, lambda=0.07781
## + Fold3.Rep1: alpha=0.550, lambda=0.07781
## - Fold3.Rep1: alpha=0.550, lambda=0.07781
## + Fold3.Rep1: alpha=0.775, lambda=0.07781
## - Fold3.Rep1: alpha=0.775, lambda=0.07781
## + Fold3.Rep1: alpha=1.000, lambda=0.07781
## - Fold3.Rep1: alpha=1.000, lambda=0.07781
## + Fold1.Rep2: alpha=0.100, lambda=0.07781
## - Fold1.Rep2: alpha=0.100, lambda=0.07781
## + Fold1.Rep2: alpha=0.325, lambda=0.07781
## - Fold1.Rep2: alpha=0.325, lambda=0.07781
## + Fold1.Rep2: alpha=0.550, lambda=0.07781
## - Fold1.Rep2: alpha=0.550, lambda=0.07781
## + Fold1.Rep2: alpha=0.775, lambda=0.07781
## - Fold1.Rep2: alpha=0.775, lambda=0.07781
## + Fold1.Rep2: alpha=1.000, lambda=0.07781
## - Fold1.Rep2: alpha=1.000, lambda=0.07781
## + Fold2.Rep2: alpha=0.100, lambda=0.07781
## - Fold2.Rep2: alpha=0.100, lambda=0.07781
## + Fold2.Rep2: alpha=0.325, lambda=0.07781
## - Fold2.Rep2: alpha=0.325, lambda=0.07781
## + Fold2.Rep2: alpha=0.550, lambda=0.07781
## - Fold2.Rep2: alpha=0.550, lambda=0.07781
## + Fold2.Rep2: alpha=0.775, lambda=0.07781
## - Fold2.Rep2: alpha=0.775, lambda=0.07781
## + Fold2.Rep2: alpha=1.000, lambda=0.07781
## - Fold2.Rep2: alpha=1.000, lambda=0.07781
## + Fold3.Rep2: alpha=0.100, lambda=0.07781
## - Fold3.Rep2: alpha=0.100, lambda=0.07781
## + Fold3.Rep2: alpha=0.325, lambda=0.07781
## - Fold3.Rep2: alpha=0.325, lambda=0.07781
## + Fold3.Rep2: alpha=0.550, lambda=0.07781
## - Fold3.Rep2: alpha=0.550, lambda=0.07781
## + Fold3.Rep2: alpha=0.775, lambda=0.07781
## - Fold3.Rep2: alpha=0.775, lambda=0.07781
## + Fold3.Rep2: alpha=1.000, lambda=0.07781
## - Fold3.Rep2: alpha=1.000, lambda=0.07781
## + Fold1.Rep3: alpha=0.100, lambda=0.07781
## - Fold1.Rep3: alpha=0.100, lambda=0.07781
## + Fold1.Rep3: alpha=0.325, lambda=0.07781
## - Fold1.Rep3: alpha=0.325, lambda=0.07781
## + Fold1.Rep3: alpha=0.550, lambda=0.07781
## - Fold1.Rep3: alpha=0.550, lambda=0.07781
## + Fold1.Rep3: alpha=0.775, lambda=0.07781
## - Fold1.Rep3: alpha=0.775, lambda=0.07781
## + Fold1.Rep3: alpha=1.000, lambda=0.07781
## - Fold1.Rep3: alpha=1.000, lambda=0.07781
## + Fold2.Rep3: alpha=0.100, lambda=0.07781
## - Fold2.Rep3: alpha=0.100, lambda=0.07781
## + Fold2.Rep3: alpha=0.325, lambda=0.07781
## - Fold2.Rep3: alpha=0.325, lambda=0.07781
## + Fold2.Rep3: alpha=0.550, lambda=0.07781
## - Fold2.Rep3: alpha=0.550, lambda=0.07781
## + Fold2.Rep3: alpha=0.775, lambda=0.07781
## - Fold2.Rep3: alpha=0.775, lambda=0.07781
## + Fold2.Rep3: alpha=1.000, lambda=0.07781
## - Fold2.Rep3: alpha=1.000, lambda=0.07781
## + Fold3.Rep3: alpha=0.100, lambda=0.07781
## - Fold3.Rep3: alpha=0.100, lambda=0.07781
## + Fold3.Rep3: alpha=0.325, lambda=0.07781
## - Fold3.Rep3: alpha=0.325, lambda=0.07781
## + Fold3.Rep3: alpha=0.550, lambda=0.07781
## - Fold3.Rep3: alpha=0.550, lambda=0.07781
## + Fold3.Rep3: alpha=0.775, lambda=0.07781
## - Fold3.Rep3: alpha=0.775, lambda=0.07781
## + Fold3.Rep3: alpha=1.000, lambda=0.07781
## - Fold3.Rep3: alpha=1.000, lambda=0.07781
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.55, lambda = 0.0168 on full training set
## Length Class Mode
## a0 100 -none- numeric
## beta 16200 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 162 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -4.78104098
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.15716647
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.50088062
## NDSSName.my.fctr#U.S.#Education
## -0.33434263
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.02894109
## NDSSName.my.fctrBusiness#Technology#
## 0.19883102
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.36547613
## NDSSName.my.fctrOpEd#Opinion#
## 3.34643876
## NDSSName.my.fctrScience#Health#
## 2.47151545
## NDSSName.my.fctrStyles##Fashion
## -0.00339643
## NDSSName.my.fctrStyles#U.S.#
## 2.23387632
## NDSSName.my.fctrTStyle##
## -0.45726935
## PubDate.day.minutes.poly.1
## 10.36813376
## PubDate.day.minutes.poly.2
## 6.33879536
## PubDate.day.minutes.poly.4
## 4.04939718
## PubDate.wkend
## 0.27219117
## WordCount.log1p
## 0.22027445
## WordCount.root2
## 0.04497217
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -5.72182542
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 1.34345995
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -4.88031295
## NDSSName.my.fctr#Opinion#RoomForDebate
## -1.28068147
## NDSSName.my.fctr#Opinion#ThePublicEditor
## 2.58795738
## NDSSName.my.fctr#U.S.#Education
## -0.41050213
## NDSSName.my.fctrBusiness#Crosswords/Games#
## 3.06553773
## NDSSName.my.fctrBusiness#Technology#
## 0.24453644
## NDSSName.my.fctrForeign#World#AsiaPacific
## -0.43656196
## NDSSName.my.fctrOpEd#Opinion#
## 3.39324898
## NDSSName.my.fctrScience#Health#
## 2.50984370
## NDSSName.my.fctrStyles##Fashion
## -0.07571194
## NDSSName.my.fctrStyles#U.S.#
## 2.27571593
## NDSSName.my.fctrTStyle##
## -0.50494153
## PubDate.day.minutes.poly.1
## 10.73014392
## PubDate.day.minutes.poly.2
## 7.19255801
## PubDate.day.minutes.poly.4
## 4.41523107
## PubDate.wkend
## 0.29479163
## WordCount.log1p
## 0.23019703
## WordCount.root2
## 0.04594506
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg
## -7.91528672
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg
## 2.10344796
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg
## 0.11241578
## Prediction
## Reference N Y
## N 5147 292
## Y 331 762
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.046234e-01 6.527832e-01 8.972418e-01 9.116414e-01 8.326699e-01
## AccuracyPValue McnemarPValue
## 8.863659e-63 1.278994e-01
## id
## 1 Final##rcv#glmnet
## feats
## 1 WordCount.root2,WordCount.log1p,NDSSName.my.fctr,PubDate.day.minutes.poly.1,PubDate.hour.fctr,PubDate.wkend,PubDate.day.minutes.poly.4,PubDate.day.minutes.poly.2,PubDate.last4.log1p,PubDate.last2.log1p,PubDate.last8.log1p,PubDate.last16.log1p,PubDate.day.minutes.poly.3,PubDate.month.fctr,PubDate.juliandate,.rnorm,PubDate.last32.log1p,PubDate.date.fctr,PubDate.second.fctr,PubDate.minute.fctr,PubDate.wkday.fctr,WordCount.nexp,PubDate.day.minutes.poly.5,NDSSName.my.fctr:PubDate.day.minutes.poly.3.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.4.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.5.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.2.ctg,NDSSName.my.fctr:PubDate.day.minutes.poly.1.ctg
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 114.206 3.96
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.7936937 0.9643317 0.6230558 0.9303324
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.7098277 0.9060012
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.8972418 0.9116414 0.6325805
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.00415203 0.01980279
# ret_lst is only assigned when the final-model step above actually refits a model
# (its reuse branches, and a skipped glmnet fit, never create it). Guard the cleanup
# so those paths do not raise "object 'ret_lst' not found".
if (exists("ret_lst", inherits = FALSE)) rm(ret_lst)
# Advance the global chunk-timing table to the next minor step of "fit.data.training"
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc = FALSE)
## label step_major step_minor label_minor bgn end
## 14 fit.data.training 7 0 0 449.661 595.943
## 15 fit.data.training 7 1 1 595.944 NA
## elapsed
## 14 146.282
## 15 NA
#stop("here"); glb2Sav()
# Default probability cutoff handed to glb_get_predictions(): for binomial
# classification it is the OOB-optimal threshold recorded for the *selected*
# model (glb_sel_mdl_id); for every other problem type no cutoff applies.
prob_threshold <- if (glb_is_classification && glb_is_binomial) {
  glb_models_df[glb_models_df$id == glb_sel_mdl_id, "opt.prob.threshold.OOB"]
} else {
  NULL
}

if (grepl("Ensemble", glb_fin_mdl_id)) {
  # The final model is an ensemble: its "feats" entry lists the component
  # models' prediction columns. Each component must be predicted first, since
  # outliers that were moved to OOB might not have predictions yet.
  mdlEnsembleComps <- unlist(
    str_split(subset(glb_models_df, id == glb_fin_mdl_id)$feats, ",")
  )

  # Binomial prediction columns carry a trailing ".prob"; drop it.
  if (glb_is_classification && glb_is_binomial) {
    mdlEnsembleComps <- gsub("\\.prob$", "", mdlEnsembleComps)
  }

  # Strip the leading prediction-id prefix (dots escaped so that they match
  # literally), leaving what is passed on as the component model ids.
  mdlEnsembleComps <- gsub(
    paste0(
      "^",
      gsub(".", "\\.", mygetPredictIds(glb_rsp_var)$value, fixed = TRUE)
    ),
    "",
    mdlEnsembleComps
  )

  # Add each component model's predictions to both observation sets.
  for (mdl_id in mdlEnsembleComps) {
    glbObsTrn <- glb_get_predictions(
      df = glbObsTrn,
      mdl_id = mdl_id,
      rsp_var = glb_rsp_var,
      prob_threshold_def = prob_threshold
    )
    glbObsNew <- glb_get_predictions(
      df = glbObsNew,
      mdl_id = mdl_id,
      rsp_var = glb_rsp_var,
      prob_threshold_def = prob_threshold
    )
  }
}

# Predictions of the final model itself on the training observations.
glbObsTrn <- glb_get_predictions(
  df = glbObsTrn,
  mdl_id = glb_fin_mdl_id,
  rsp_var = glb_rsp_var,
  prob_threshold_def = prob_threshold
)
## Warning in glb_get_predictions(df = glbObsTrn, mdl_id = glb_fin_mdl_id, :
## Using default probability threshold: 0.1
# Refresh the feature-importance table using the final model, then keep a copy
# of the resulting "imp" column under a name tagged with the final model id
# ("<glb_fin_mdl_id>.imp") before printing the whole table.
glb_featsimp_df <- myget_feats_importance(
  mdl = glb_fin_mdl,
  featsimp_df = glb_featsimp_df
)
glb_featsimp_df[[paste0(glb_fin_mdl_id, ".imp")]] <- glb_featsimp_df[["imp"]]
print(glb_featsimp_df)
## All.X##rcv#glmnet.imp
## PubDate.day.minutes.poly.1 100.00000
## PubDate.day.minutes.poly.2 43.63891
## PubDate.day.minutes.poly.4 59.18904
## NDSSName.my.fctrOpEd#Opinion# 47.21133
## NDSSName.my.fctrBusiness#Crosswords/Games# 45.38051
## NDSSName.my.fctr#Opinion#ThePublicEditor 43.52674
## NDSSName.my.fctrScience#Health# 43.69957
## NDSSName.my.fctrStyles#U.S.# 42.53063
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg 63.88597
## PubDate.wkend 30.50180
## NDSSName.my.fctrBusiness#Technology# 29.35337
## WordCount.log1p 30.43597
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg 33.86620
## WordCount.root2 29.52473
## .rnorm 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Multimedia# 28.98875
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 28.13044
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 28.28957
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrCulture## 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrCulture#Arts# 28.06442
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrForeign#World# 28.20781
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region# 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg 41.77929
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg 24.23150
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg 35.10334
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg 34.04441
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg 29.59778
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrTravel#Travel# 28.38092
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrmyOther 29.23532
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.5.ctg 29.35337
## PubDate.date.fctr(13,19] 29.35337
## PubDate.date.fctr(19,25] 29.35337
## PubDate.date.fctr(25,31] 29.35337
## PubDate.date.fctr(7,13] 29.35337
## PubDate.day.minutes.poly.3 29.35337
## PubDate.day.minutes.poly.5 29.35337
## PubDate.hour.fctr(15.3,23] 29.64299
## PubDate.hour.fctr(7.67,15.3] 29.35337
## PubDate.juliandate 29.35337
## PubDate.last16.log1p 29.35337
## PubDate.last2.log1p 29.43840
## PubDate.last32.log1p 29.35337
## PubDate.last4.log1p 29.48100
## PubDate.last8.log1p 29.36106
## PubDate.minute.fctr(14.8,29.5] 29.35337
## PubDate.minute.fctr(29.5,44.2] 29.35337
## PubDate.minute.fctr(44.2,59.1] 29.35337
## PubDate.month.fctr10 29.35337
## PubDate.month.fctr11 29.35337
## PubDate.month.fctr12 29.35337
## PubDate.second.fctr(14.8,29.5] 29.35337
## PubDate.second.fctr(29.5,44.2] 29.35337
## PubDate.second.fctr(44.2,59.1] 29.35337
## PubDate.wkday.fctr1 29.35337
## PubDate.wkday.fctr2 29.35337
## PubDate.wkday.fctr3 29.35337
## PubDate.wkday.fctr4 29.35337
## PubDate.wkday.fctr5 29.35337
## PubDate.wkday.fctr6 29.35337
## WordCount.nexp 29.35337
## NDSSName.my.fctrStyles##Fashion 27.37098
## NDSSName.my.fctr#U.S.#Education 27.32131
## NDSSName.my.fctrForeign#World#AsiaPacific 27.02315
## NDSSName.my.fctrTStyle## 26.26230
## NDSSName.my.fctr#Opinion#RoomForDebate 24.63959
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg 0.00000
## imp
## PubDate.day.minutes.poly.1 100.00000
## PubDate.day.minutes.poly.2 80.90424
## PubDate.day.minutes.poly.4 66.02225
## NDSSName.my.fctrOpEd#Opinion# 60.56361
## NDSSName.my.fctrBusiness#Crosswords/Games# 58.80165
## NDSSName.my.fctr#Opinion#ThePublicEditor 56.22569
## NDSSName.my.fctrScience#Health# 55.81153
## NDSSName.my.fctrStyles#U.S.# 54.55137
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg 53.53417
## PubDate.wkend 43.89547
## NDSSName.my.fctrBusiness#Technology# 43.62216
## WordCount.log1p 43.54951
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg 42.90290
## WordCount.root2 42.55927
## .rnorm 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Multimedia# 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrCulture## 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrCulture#Arts# 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrForeign#World# 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region# 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrTravel#Travel# 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrmyOther 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.5.ctg 42.31219
## PubDate.date.fctr(13,19] 42.31219
## PubDate.date.fctr(19,25] 42.31219
## PubDate.date.fctr(25,31] 42.31219
## PubDate.date.fctr(7,13] 42.31219
## PubDate.day.minutes.poly.3 42.31219
## PubDate.day.minutes.poly.5 42.31219
## PubDate.hour.fctr(15.3,23] 42.31219
## PubDate.hour.fctr(7.67,15.3] 42.31219
## PubDate.juliandate 42.31219
## PubDate.last16.log1p 42.31219
## PubDate.last2.log1p 42.31219
## PubDate.last32.log1p 42.31219
## PubDate.last4.log1p 42.31219
## PubDate.last8.log1p 42.31219
## PubDate.minute.fctr(14.8,29.5] 42.31219
## PubDate.minute.fctr(29.5,44.2] 42.31219
## PubDate.minute.fctr(44.2,59.1] 42.31219
## PubDate.month.fctr10 42.31219
## PubDate.month.fctr11 42.31219
## PubDate.month.fctr12 42.31219
## PubDate.second.fctr(14.8,29.5] 42.31219
## PubDate.second.fctr(29.5,44.2] 42.31219
## PubDate.second.fctr(44.2,59.1] 42.31219
## PubDate.wkday.fctr1 42.31219
## PubDate.wkday.fctr2 42.31219
## PubDate.wkday.fctr3 42.31219
## PubDate.wkday.fctr4 42.31219
## PubDate.wkday.fctr5 42.31219
## PubDate.wkday.fctr6 42.31219
## WordCount.nexp 42.31219
## NDSSName.my.fctrStyles##Fashion 41.91391
## NDSSName.my.fctr#U.S.#Education 40.11307
## NDSSName.my.fctrForeign#World#AsiaPacific 39.97221
## NDSSName.my.fctrTStyle## 39.60135
## NDSSName.my.fctr#Opinion#RoomForDebate 35.43704
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg 0.00000
## Final##rcv#glmnet.imp
## PubDate.day.minutes.poly.1 100.00000
## PubDate.day.minutes.poly.2 80.90424
## PubDate.day.minutes.poly.4 66.02225
## NDSSName.my.fctrOpEd#Opinion# 60.56361
## NDSSName.my.fctrBusiness#Crosswords/Games# 58.80165
## NDSSName.my.fctr#Opinion#ThePublicEditor 56.22569
## NDSSName.my.fctrScience#Health# 55.81153
## NDSSName.my.fctrStyles#U.S.# 54.55137
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg 53.53417
## PubDate.wkend 43.89547
## NDSSName.my.fctrBusiness#Technology# 43.62216
## WordCount.log1p 43.54951
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg 42.90290
## WordCount.root2 42.55927
## .rnorm 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Multimedia# 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrCulture## 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrCulture#Arts# 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrForeign#World# 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region# 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrTravel#Travel# 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrmyOther 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.5.ctg 42.31219
## PubDate.date.fctr(13,19] 42.31219
## PubDate.date.fctr(19,25] 42.31219
## PubDate.date.fctr(25,31] 42.31219
## PubDate.date.fctr(7,13] 42.31219
## PubDate.day.minutes.poly.3 42.31219
## PubDate.day.minutes.poly.5 42.31219
## PubDate.hour.fctr(15.3,23] 42.31219
## PubDate.hour.fctr(7.67,15.3] 42.31219
## PubDate.juliandate 42.31219
## PubDate.last16.log1p 42.31219
## PubDate.last2.log1p 42.31219
## PubDate.last32.log1p 42.31219
## PubDate.last4.log1p 42.31219
## PubDate.last8.log1p 42.31219
## PubDate.minute.fctr(14.8,29.5] 42.31219
## PubDate.minute.fctr(29.5,44.2] 42.31219
## PubDate.minute.fctr(44.2,59.1] 42.31219
## PubDate.month.fctr10 42.31219
## PubDate.month.fctr11 42.31219
## PubDate.month.fctr12 42.31219
## PubDate.second.fctr(14.8,29.5] 42.31219
## PubDate.second.fctr(29.5,44.2] 42.31219
## PubDate.second.fctr(44.2,59.1] 42.31219
## PubDate.wkday.fctr1 42.31219
## PubDate.wkday.fctr2 42.31219
## PubDate.wkday.fctr3 42.31219
## PubDate.wkday.fctr4 42.31219
## PubDate.wkday.fctr5 42.31219
## PubDate.wkday.fctr6 42.31219
## WordCount.nexp 42.31219
## NDSSName.my.fctrStyles##Fashion 41.91391
## NDSSName.my.fctr#U.S.#Education 40.11307
## NDSSName.my.fctrForeign#World#AsiaPacific 39.97221
## NDSSName.my.fctrTStyle## 39.60135
## NDSSName.my.fctr#Opinion#RoomForDebate 35.43704
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.2.ctg 0.00000
# Diagnostic plots for the final model (glb_fin_mdl_id) on the training
# observations.
# For a binomial classification problem the probability threshold passed to
# the plot helper is the "opt.prob.threshold.OOB" value stored in
# glb_models_df for the *selected* model (glb_sel_mdl_id); otherwise the
# helper is called without a threshold argument.
if (glb_is_classification && glb_is_binomial) {
    glb_analytics_diag_plots(
        obs_df = glbObsTrn,
        mdl_id = glb_fin_mdl_id,
        prob_threshold = glb_models_df[glb_models_df$id == glb_sel_mdl_id,
                                       "opt.prob.threshold.OOB"]
    )
} else {
    glb_analytics_diag_plots(obs_df = glbObsTrn, mdl_id = glb_fin_mdl_id)
}
## Warning in glb_analytics_diag_plots(obs_df = glbObsTrn, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 28
## [1] "Min/Max Boundaries: "
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1 1065 N NA
## 2 4168 N 0.04112848
## 3 5647 N 0.12994825
## 4 302 N NA
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1 <NA> NA
## 2 N FALSE
## 3 Y TRUE
## 4 <NA> NA
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1 NA
## 2 0.04112848
## 3 0.12994825
## 4 NA
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1 NA
## 2 TRUE
## 3 FALSE
## 4 NA
## Popular.fctr.Final..rcv.glmnet.prob Popular.fctr.Final..rcv.glmnet
## 1 0.06949234 N
## 2 0.01820329 N
## 3 0.13697814 Y
## 4 0.27593024 Y
## Popular.fctr.Final..rcv.glmnet.err
## 1 FALSE
## 2 FALSE
## 3 TRUE
## 4 TRUE
## Popular.fctr.Final..rcv.glmnet.err.abs
## 1 0.06949234
## 2 0.01820329
## 3 0.13697814
## 4 0.27593024
## Popular.fctr.Final..rcv.glmnet.is.acc
## 1 TRUE
## 2 TRUE
## 3 FALSE
## 4 FALSE
## Popular.fctr.Final..rcv.glmnet.accurate
## 1 TRUE
## 2 TRUE
## 3 FALSE
## 4 FALSE
## Popular.fctr.Final..rcv.glmnet.error .label
## 1 0.00000000 1065
## 2 0.00000000 4168
## 3 0.03697814 5647
## 4 0.17593024 302
## [1] "Inaccurate: "
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1 2182 Y 0.04445092
## 2 1696 Y 0.07013375
## 3 4020 Y NA
## 4 364 Y 0.06962056
## 5 4775 Y NA
## 6 6354 Y NA
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1 N TRUE
## 2 N TRUE
## 3 <NA> NA
## 4 N TRUE
## 5 <NA> NA
## 6 <NA> NA
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1 0.9555491
## 2 0.9298663
## 3 NA
## 4 0.9303794
## 5 NA
## 6 NA
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 NA
## 4 FALSE
## 5 NA
## 6 NA
## Popular.fctr.Final..rcv.glmnet.prob Popular.fctr.Final..rcv.glmnet
## 1 0.01967665 N
## 2 0.02216048 N
## 3 0.03107144 N
## 4 0.03325074 N
## 5 0.03363740 N
## 6 0.03411410 N
## Popular.fctr.Final..rcv.glmnet.err
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## Popular.fctr.Final..rcv.glmnet.err.abs
## 1 0.9803233
## 2 0.9778395
## 3 0.9689286
## 4 0.9667493
## 5 0.9663626
## 6 0.9658859
## Popular.fctr.Final..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Popular.fctr.Final..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Popular.fctr.Final..rcv.glmnet.error
## 1 -0.08032335
## 2 -0.07783952
## 3 -0.06892856
## 4 -0.06674926
## 5 -0.06636260
## 6 -0.06588590
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 219 4335 N NA
## 254 5109 N 0.1324561
## 439 4532 N NA
## 474 4651 N 0.1515244
## 1044 5053 N NA
## 1275 5563 N 0.5782044
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 219 <NA> NA
## 254 Y TRUE
## 439 <NA> NA
## 474 Y TRUE
## 1044 <NA> NA
## 1275 Y TRUE
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 219 NA
## 254 0.1324561
## 439 NA
## 474 0.1515244
## 1044 NA
## 1275 0.5782044
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 219 NA
## 254 FALSE
## 439 NA
## 474 FALSE
## 1044 NA
## 1275 FALSE
## Popular.fctr.Final..rcv.glmnet.prob Popular.fctr.Final..rcv.glmnet
## 219 0.1082135 Y
## 254 0.1099989 Y
## 439 0.1220536 Y
## 474 0.1247365 Y
## 1044 0.2466501 Y
## 1275 0.6406628 Y
## Popular.fctr.Final..rcv.glmnet.err
## 219 TRUE
## 254 TRUE
## 439 TRUE
## 474 TRUE
## 1044 TRUE
## 1275 TRUE
## Popular.fctr.Final..rcv.glmnet.err.abs
## 219 0.1082135
## 254 0.1099989
## 439 0.1220536
## 474 0.1247365
## 1044 0.2466501
## 1275 0.6406628
## Popular.fctr.Final..rcv.glmnet.is.acc
## 219 FALSE
## 254 FALSE
## 439 FALSE
## 474 FALSE
## 1044 FALSE
## 1275 FALSE
## Popular.fctr.Final..rcv.glmnet.accurate
## 219 FALSE
## 254 FALSE
## 439 FALSE
## 474 FALSE
## 1044 FALSE
## 1275 FALSE
## Popular.fctr.Final..rcv.glmnet.error
## 219 0.008213543
## 254 0.009998861
## 439 0.022053646
## 474 0.024736548
## 1044 0.146650091
## 1275 0.540662768
## UniqueID Popular.fctr Popular.fctr.All.X..rcv.glmnet.prob
## 1365 221 N NA
## 1366 472 N NA
## 1367 1612 N 0.7987708
## 1368 1448 N NA
## 1369 3590 N NA
## 1370 2995 N NA
## Popular.fctr.All.X..rcv.glmnet Popular.fctr.All.X..rcv.glmnet.err
## 1365 <NA> NA
## 1366 <NA> NA
## 1367 Y TRUE
## 1368 <NA> NA
## 1369 <NA> NA
## 1370 <NA> NA
## Popular.fctr.All.X..rcv.glmnet.err.abs
## 1365 NA
## 1366 NA
## 1367 0.7987708
## 1368 NA
## 1369 NA
## 1370 NA
## Popular.fctr.All.X..rcv.glmnet.is.acc
## 1365 NA
## 1366 NA
## 1367 FALSE
## 1368 NA
## 1369 NA
## 1370 NA
## Popular.fctr.Final..rcv.glmnet.prob Popular.fctr.Final..rcv.glmnet
## 1365 0.9107409 Y
## 1366 0.9142664 Y
## 1367 0.9156595 Y
## 1368 0.9159299 Y
## 1369 0.9184685 Y
## 1370 0.9217662 Y
## Popular.fctr.Final..rcv.glmnet.err
## 1365 TRUE
## 1366 TRUE
## 1367 TRUE
## 1368 TRUE
## 1369 TRUE
## 1370 TRUE
## Popular.fctr.Final..rcv.glmnet.err.abs
## 1365 0.9107409
## 1366 0.9142664
## 1367 0.9156595
## 1368 0.9159299
## 1369 0.9184685
## 1370 0.9217662
## Popular.fctr.Final..rcv.glmnet.is.acc
## 1365 FALSE
## 1366 FALSE
## 1367 FALSE
## 1368 FALSE
## 1369 FALSE
## 1370 FALSE
## Popular.fctr.Final..rcv.glmnet.accurate
## 1365 FALSE
## 1366 FALSE
## 1367 FALSE
## 1368 FALSE
## 1369 FALSE
## 1370 FALSE
## Popular.fctr.Final..rcv.glmnet.error
## 1365 0.8107409
## 1366 0.8142664
## 1367 0.8156595
## 1368 0.8159299
## 1369 0.8184685
## 1370 0.8217662
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
# Build the vector of feature ids to display: every "id" in glb_feats_df that
# has a non-NA value in at least one column whose name contains ".imp".
# The fold starts from NULL and unions the ids column by column, so ids keep
# the order in which they are first encountered.
dsp_feats_vctr <- Reduce(
    function(ids_so_far, imp_col) {
        has_imp <- !is.na(glb_feats_df[, imp_col])
        union(ids_so_far, glb_feats_df[has_imp, "id"])
    },
    grep(".imp", names(glb_feats_df), fixed = TRUE, value = TRUE),
    NULL
)
# print(glbObsTrn[glbObsTrn$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glbObsTrn), value=TRUE)])

# List the columns that exist in glbObsTrn but are still missing from
# glbObsAll.
print(setdiff(names(glbObsTrn), names(glbObsAll)))
## [1] "Popular.fctr.Final..rcv.glmnet.prob"
## [2] "Popular.fctr.Final..rcv.glmnet"
## [3] "Popular.fctr.Final..rcv.glmnet.err"
## [4] "Popular.fctr.Final..rcv.glmnet.err.abs"
## [5] "Popular.fctr.Final..rcv.glmnet.is.acc"

# Copy each of those columns onto the rows of glbObsAll flagged as "Train".
# Merge or cbind ?
# NOTE(review): the assignment is positional, so it presumably relies on the
#   rows of glbObsTrn being in the same order as the "Train" rows of
#   glbObsAll -- confirm against where glbObsTrn is derived.
for (col in setdiff(names(glbObsTrn), names(glbObsAll))) {
    glbObsAll[glbObsAll$.src == "Train", col] <- glbObsTrn[, col]
}
# List any columns present in glbObsFit but missing from glbObsAll
# (none in this run).
print(setdiff(names(glbObsFit), names(glbObsAll)))
## character(0)

# Same check for glbObsOOB (none in this run).
print(setdiff(names(glbObsOOB), names(glbObsAll)))
## character(0)

# Copy any OOB-only columns onto the rows of glbObsAll whose .lcn is "OOB".
# Merge or cbind ?
# NOTE(review): positional assignment -- presumably relies on glbObsOOB rows
#   being in the same order as the "OOB" rows of glbObsAll; confirm.
for (col in setdiff(names(glbObsOOB), names(glbObsAll))) {
    glbObsAll[glbObsAll$.lcn == "OOB", col] <- glbObsOOB[, col]
}

# Same check for glbObsNew (none in this run).
print(setdiff(names(glbObsNew), names(glbObsAll)))
## character(0)
# When glb_save_envir is set, write the main analysis objects to the file
# "<glb_out_pfx>dsk.RData". Objects are named through save()'s `list`
# argument, which saves the same objects as passing them as bare names.
if (glb_save_envir) {
    save(
        list = c(
            "glb_feats_df", "glbObsAll",
            # "glbObsTrn", "glbObsFit", "glbObsOOB", "glbObsNew",
            "glb_models_df", "dsp_models_df", "glb_models_lst",
            "glb_model_type",
            "glb_sel_mdl", "glb_sel_mdl_id",
            "glb_fin_mdl", "glb_fin_mdl_id"
        ),
        file = paste0(glb_out_pfx, "dsk.RData")
    )
}
# Register "data.training.all.prediction" and "model.final" as available
# analytics objects, then replay the workflow net (glb_analytics_pn) with the
# updated list.
#
# The update of glb_analytics_avl_objs is its own statement instead of an
# assignment embedded in the `replay.trans` argument: R evaluates arguments
# lazily, so the embedded form only updated the global if and when
# replay.petrisim() actually forced that argument.
glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
                            "data.training.all.prediction", "model.final")
replay.petrisim(pn = glb_analytics_pn,
                replay.trans = glb_analytics_avl_objs,
                flip_coord = TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
## 3.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: data.training.all.prediction
## 4.0000 5 0 1 1 1
## 4.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: model.final
## 5.0000 4 0 0 2 1
# Append a "predict.data.new" entry to the chunk log kept in glb_chunks_df.
# NOTE(review): major.inc=TRUE appears to advance the major step counter (the
#   printed log moves from step 7.1 to step 8.0) -- confirm in myadd_chunk().
glb_chunks_df <- myadd_chunk(glb_chunks_df, "predict.data.new", major.inc=TRUE)
## label step_major step_minor label_minor bgn end
## 15 fit.data.training 7 1 1 595.944 610.616
## 16 predict.data.new 8 0 0 610.617 NA
## elapsed
## 15 14.672
## 16 NA
8.0: predict data new
## Warning in glb_get_predictions(obs_df, mdl_id = glb_fin_mdl_id, rsp_var =
## glb_rsp_var, : Using default probability threshold: 0.1
## Warning in glb_get_predictions(obs_df, mdl_id = glb_fin_mdl_id, rsp_var =
## glb_rsp_var, : Using default probability threshold: 0.1
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in zoo(rval[i], index(x)[i]): some methods for "zoo" objects do not
## work if the index entries in 'order.by' are not unique
## Warning in glb_analytics_diag_plots(obs_df = glbObsNew, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 28
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## Warning: Removed 1870 rows containing missing values (geom_point).
## NULL
## Loading required package: stringr
## [1] "ObsNew Prediction errors in categories:"
## NDSSName.my.fctr .n.Trn.N .n.Trn.Y .n.New.N .n.New.Y
## 5 #U.S.#Education 325 NA 83 6
## 10 Culture## 1 NA 51 19
## 12 Foreign#World# 172 NA 46 1
## 21 myOther 38 NA 4 1
## .n.Trn.N .n.Trn.Y .n.New.N .n.New.Y
## 536 0 184 27
## Loading required package: tidyr
##
## Attaching package: 'tidyr'
##
## The following object is masked from 'package:Matrix':
##
## expand
## [1] "OOBobs Popular.fctr.All.X..rcv.glmnet Y: min < min of Train range: 10"
## UniqueID Popular.fctr.All.X..rcv.glmnet
## 1631 1631 Y
## 5431 5431 Y
## 1906 1906 Y
## 3872 3872 Y
## 2645 2645 Y
## 6435 6435 Y
## 1767 1767 Y
## 1923 1923 Y
## 4223 4223 Y
## 1930 1930 Y
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.2
## 1631 0.04035041 0.002185492
## 5431 0.18988524 -0.003298637
## 1906 -0.28422208 -0.007994015
## 3872 0.01079276 -0.008744657
## 2645 0.04444564 0.012594250
## 6435 -0.02734665 0.002105741
## 1767 -0.02245774 -0.004384985
## 1923 -0.06338396 -0.008641599
## 4223 -0.04543407 -0.008758791
## 1930 0.02392413 -0.007891784
## PubDate.day.minutes.poly.2.ctg PubDate.day.minutes.poly.4
## 1631 -0.164542461 -0.011653637
## 5431 0.160017937 -0.004425843
## 1906 0.189926123 0.006563830
## 3872 -0.104973596 0.009341415
## 2645 0.035996050 -0.018240126
## 6435 -0.017879840 -0.010140008
## 1767 -0.201374531 -0.002340509
## 1923 -0.156983579 0.009616977
## 4223 -0.006162687 0.009518864
## 1930 -0.159715310 0.008326544
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.5.ctg
## 1631 0.08825344 0.18274711
## 5431 -0.40164220 -0.54313226
## 1906 -0.30249370 0.41093522
## 3872 -0.51296095 -0.45711886
## 2645 -0.01082616 -0.03079946
## 6435 0.02178124 -0.02033356
## 1767 0.33305455 -0.13083307
## 1923 0.21319110 -0.10732959
## 4223 -0.02181201 -0.01124082
## 1930 0.19588297 0.13994102
## WordCount.log1p WordCount.root2
## 1631 5.598422 16.40122
## 5431 7.295056 38.36665
## 1906 7.160846 35.87478
## 3872 6.647688 27.74887
## 2645 6.635947 27.58623
## 6435 0.000000 0.00000
## 1767 6.979145 32.75668
## 1923 6.142037 21.54066
## 4223 7.274480 37.97368
## 1930 5.541264 15.93738
## id cor.y
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.2 0.070977720
## PubDate.day.minutes.poly.2.ctg PubDate.day.minutes.poly.2.ctg 0.003596414
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## WordCount.log1p WordCount.log1p 0.254319628
## WordCount.root2 WordCount.root2 0.292120679
## exclude.as.feat cor.y.abs cor.high.X
## PubDate.day.minutes.poly.1.ctg FALSE 0.002432289 <NA>
## PubDate.day.minutes.poly.2 FALSE 0.070977720 <NA>
## PubDate.day.minutes.poly.2.ctg FALSE 0.003596414 <NA>
## PubDate.day.minutes.poly.4 FALSE 0.073941394 <NA>
## PubDate.day.minutes.poly.4.ctg FALSE 0.014601521 <NA>
## PubDate.day.minutes.poly.5.ctg FALSE 0.014574775 <NA>
## WordCount.log1p FALSE 0.254319628 WordCount.root2
## WordCount.root2 FALSE 0.292120679 <NA>
## freqRatio percentUnique zeroVar nzv
## PubDate.day.minutes.poly.1.ctg 1.083333 53.96509 FALSE FALSE
## PubDate.day.minutes.poly.2 1.225490 18.08022 FALSE FALSE
## PubDate.day.minutes.poly.2.ctg 1.083333 53.94979 FALSE FALSE
## PubDate.day.minutes.poly.4 1.225490 18.08022 FALSE FALSE
## PubDate.day.minutes.poly.4.ctg 1.083333 53.94979 FALSE FALSE
## PubDate.day.minutes.poly.5.ctg 1.083333 53.94979 FALSE FALSE
## WordCount.log1p 2.315789 24.15799 FALSE FALSE
## WordCount.root2 2.315789 24.15799 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## PubDate.day.minutes.poly.1.ctg TRUE NDSSName.my.fctr
## PubDate.day.minutes.poly.2 FALSE <NA>
## PubDate.day.minutes.poly.2.ctg TRUE NDSSName.my.fctr
## PubDate.day.minutes.poly.4 FALSE <NA>
## PubDate.day.minutes.poly.4.ctg FALSE NDSSName.my.fctr
## PubDate.day.minutes.poly.5.ctg FALSE NDSSName.my.fctr
## WordCount.log1p FALSE <NA>
## WordCount.root2 FALSE <NA>
## shapiro.test.p.value rsp_var_raw id_var
## PubDate.day.minutes.poly.1.ctg 1.051535e-45 FALSE NA
## PubDate.day.minutes.poly.2 8.020999e-64 FALSE NA
## PubDate.day.minutes.poly.2.ctg 2.302769e-65 FALSE NA
## PubDate.day.minutes.poly.4 1.523136e-47 FALSE NA
## PubDate.day.minutes.poly.4.ctg 2.214419e-67 FALSE NA
## PubDate.day.minutes.poly.5.ctg 7.171204e-67 FALSE NA
## WordCount.log1p 1.576866e-49 FALSE NA
## WordCount.root2 4.556481e-30 FALSE NA
## rsp_var max min
## PubDate.day.minutes.poly.1.ctg NA 0.48127714 -0.707011442
## PubDate.day.minutes.poly.2 NA 0.04268445 -0.008758791
## PubDate.day.minutes.poly.2.ctg NA 0.75539456 -0.221260607
## PubDate.day.minutes.poly.4 NA 0.06677441 -0.018327397
## PubDate.day.minutes.poly.4.ctg NA 0.67700049 -0.611884133
## PubDate.day.minutes.poly.5.ctg NA 0.56286316 -0.716534449
## WordCount.log1p NA 9.29771002 0.000000000
## WordCount.root2 NA 104.46051886 0.000000000
## max.Popular.fctr.N max.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg 0.35968907 0.32950245
## PubDate.day.minutes.poly.2 0.04268445 0.04254377
## PubDate.day.minutes.poly.2.ctg 0.75539456 0.43056671
## PubDate.day.minutes.poly.4 0.06543120 0.06149053
## PubDate.day.minutes.poly.4.ctg 0.67700049 0.28961875
## PubDate.day.minutes.poly.5.ctg 0.56286316 0.21585241
## WordCount.log1p 8.81966535 9.29771002
## WordCount.root2 82.24962006 104.46051886
## min.Popular.fctr.N min.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg -0.399776908 -0.245703803
## PubDate.day.minutes.poly.2 -0.008758791 -0.008758717
## PubDate.day.minutes.poly.2.ctg -0.221260607 -0.155122711
## PubDate.day.minutes.poly.4 -0.018327397 -0.018219595
## PubDate.day.minutes.poly.4.ctg -0.611884133 -0.282432189
## PubDate.day.minutes.poly.5.ctg -0.716534449 -0.370586479
## WordCount.log1p 0.000000000 1.945910149
## WordCount.root2 0.000000000 2.449489743
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.48127714
## PubDate.day.minutes.poly.2 0.04268445
## PubDate.day.minutes.poly.2.ctg 0.74739148
## PubDate.day.minutes.poly.4 0.06213811
## PubDate.day.minutes.poly.4.ctg 0.49508199
## PubDate.day.minutes.poly.5.ctg 0.48962874
## WordCount.log1p 7.05961763
## WordCount.root2 34.10278581
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.19773100
## PubDate.day.minutes.poly.2 0.04254377
## PubDate.day.minutes.poly.2.ctg 0.35042637
## PubDate.day.minutes.poly.4 0.06610094
## PubDate.day.minutes.poly.4.ctg 0.45727441
## PubDate.day.minutes.poly.5.ctg 0.41093522
## WordCount.log1p 9.14088311
## WordCount.root2 96.58157174
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.707011442
## PubDate.day.minutes.poly.2 -0.008758672
## PubDate.day.minutes.poly.2.ctg -0.201374531
## PubDate.day.minutes.poly.4 -0.018326850
## PubDate.day.minutes.poly.4.ctg -0.163870979
## PubDate.day.minutes.poly.5.ctg -0.211418410
## WordCount.log1p 0.000000000
## WordCount.root2 0.000000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.284222080
## PubDate.day.minutes.poly.2 -0.008758791
## PubDate.day.minutes.poly.2.ctg -0.201374531
## PubDate.day.minutes.poly.4 -0.018240126
## PubDate.day.minutes.poly.4.ctg -0.512960949
## PubDate.day.minutes.poly.5.ctg -0.543132262
## WordCount.log1p 0.000000000
## WordCount.root2 0.000000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.22609552
## PubDate.day.minutes.poly.2 0.04254377
## PubDate.day.minutes.poly.2.ctg 0.60416557
## PubDate.day.minutes.poly.4 0.05340046
## PubDate.day.minutes.poly.4.ctg 0.63819571
## PubDate.day.minutes.poly.5.ctg 0.45824974
## WordCount.log1p 7.61332498
## WordCount.root2 44.98888752
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.19641586
## PubDate.day.minutes.poly.2 0.04268445
## PubDate.day.minutes.poly.2.ctg 0.33756882
## PubDate.day.minutes.poly.4 0.06677441
## PubDate.day.minutes.poly.4.ctg 0.38235412
## PubDate.day.minutes.poly.5.ctg 0.42244492
## WordCount.log1p 8.69232228
## WordCount.root2 77.17512553
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.254654849
## PubDate.day.minutes.poly.2 -0.008758791
## PubDate.day.minutes.poly.2.ctg -0.191639985
## PubDate.day.minutes.poly.4 -0.018322678
## PubDate.day.minutes.poly.4.ctg -0.239606422
## PubDate.day.minutes.poly.5.ctg -0.354757272
## WordCount.log1p 0.000000000
## WordCount.root2 0.000000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.253130167
## PubDate.day.minutes.poly.2 -0.008758672
## PubDate.day.minutes.poly.2.ctg -0.189941446
## PubDate.day.minutes.poly.4 -0.018203392
## PubDate.day.minutes.poly.4.ctg -0.205244731
## PubDate.day.minutes.poly.5.ctg -0.280963223
## WordCount.log1p 1.609437912
## WordCount.root2 2.000000000
## [1] "OOBobs Popular.fctr.All.X..rcv.glmnet Y: max > max of Train range: 16"
## UniqueID Popular.fctr.All.X..rcv.glmnet PubDate.day.minutes.poly.1
## 1908 1908 Y 0.001882238
## 1922 1922 Y -0.002039521
## 5233 5233 Y -0.016600865
## 6528 6528 Y 0.001809613
## 1627 1627 Y 0.012231324
## 1906 1906 Y 0.002281677
## 302 302 Y 0.024722851
## 3770 3770 Y 0.001918551
## 4466 4466 Y 0.005586121
## 6435 6435 Y -0.013151170
## 1767 1767 Y 0.006784437
## 1923 1923 Y -0.002221084
## 1928 1928 Y -0.003709899
## 6517 6517 Y 0.020474279
## 3205 3205 Y 0.002898990
## 6521 6521 Y 0.013901702
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3.ctg
## 1908 -0.004571013 0.13906448
## 1922 0.001236262 -0.15016859
## 5233 0.007145007 0.01482341
## 6528 -0.004472838 -0.03222702
## 1627 -0.007282868 -0.13040699
## 1906 -0.005100600 0.03046261
## 302 0.051829879 0.01497441
## 3770 -0.004619890 0.43176828
## 4466 -0.008578107 0.03634546
## 6435 0.010843928 0.01771752
## 1767 -0.009309113 0.02489965
## 1923 0.001515129 0.08106078
## 1928 0.003766018 -0.12101585
## 6517 0.020825101 -0.01019734
## 3205 -0.005881110 0.01996895
## 6521 -0.004412350 -0.01394151
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4.ctg
## 1908 0.0071228891 -0.23421325
## 1922 0.0096415269 -0.09688160
## 5233 -0.0170009782 0.02271875
## 6528 0.0072191411 -0.01196576
## 1627 -0.0128746495 -0.11643110
## 1906 0.0065638297 -0.30249370
## 302 0.0661009370 0.01396653
## 3770 0.0070741317 0.45727441
## 4466 0.0003590042 0.11223265
## 6435 -0.0101400077 0.02178124
## 1767 -0.0023405093 0.33305455
## 1923 0.0096169766 0.21319110
## 1928 0.0089316619 -0.23544576
## 6517 0.0101361259 0.02716643
## 3205 0.0056054933 0.01405419
## 6521 -0.0138030620 -0.03721490
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5.ctg
## 1908 0.006785787 0.12227512
## 1922 -0.002098247 0.04932191
## 5233 0.013803383 -0.01404809
## 6528 0.006658270 0.02271226
## 1627 -0.006115494 0.30341253
## 1906 0.007446867 0.41093522
## 302 0.083442278 -0.02109223
## 3770 0.006848725 0.38140927
## 4466 0.009641107 -0.01748172
## 6435 -0.001891593 -0.02033356
## 1767 0.008727735 -0.13083307
## 1923 -0.002547032 -0.10732959
## 1928 -0.006056430 -0.21554955
## 6517 -0.004924177 0.01414065
## 3205 0.008323368 -0.02586218
## 6521 -0.012191759 -0.02427900
## PubDate.last16.log1p PubDate.last16.log1p.ctg PubDate.last2.log1p.ctg
## 1908 8.872627 0.00000 13.672085
## 1922 9.175852 15.75212 13.934985
## 5233 11.762734 11.82663 8.116417
## 6528 11.956983 12.49349 9.209840
## 1627 8.992806 0.00000 0.000000
## 1906 8.873748 0.00000 12.444822
## 302 10.077063 11.12598 7.814400
## 3770 8.888343 13.37261 10.528838
## 4466 8.716700 13.93166 13.659964
## 6435 10.338641 12.76144 10.337832
## 1767 8.591373 13.86545 12.657569
## 1923 9.173261 15.77251 14.253507
## 1928 9.283033 15.61660 13.669639
## 6517 11.448558 12.13813 11.371500
## 3205 11.850590 12.37754 8.836665
## 6521 11.685390 12.56239 9.680219
## PubDate.last32.log1p PubDate.last8.log1p WordCount.nexp
## 1908 9.718182 8.173857 1.994412e-151
## 1922 10.010547 8.276395 0.000000e+00
## 5233 11.819748 11.622461 5.482209e-194
## 6528 12.109529 11.425547 0.000000e+00
## 1627 9.504129 8.393216 0.000000e+00
## 1906 9.729253 8.075272 0.000000e+00
## 302 10.332897 9.741557 0.000000e+00
## 3770 9.938710 8.392537 3.418239e-166
## 4466 9.313619 7.827640 2.371872e-102
## 6435 10.866967 8.695172 1.000000e+00
## 1767 9.415401 7.891331 0.000000e+00
## 1923 10.018600 8.363109 3.071570e-202
## 1928 10.027959 8.536800 0.000000e+00
## 6517 12.217912 9.753188 2.750325e-314
## 3205 12.005436 11.443361 1.026188e-10
## 6521 12.196224 10.414633 0.000000e+00
## id cor.y
## PubDate.day.minutes.poly.1 PubDate.day.minutes.poly.1 0.156753478
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3 0.027983551
## PubDate.day.minutes.poly.3.ctg PubDate.day.minutes.poly.3.ctg 0.014982807
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5 -0.055929231
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## PubDate.last16.log1p PubDate.last16.log1p 0.040735543
## PubDate.last16.log1p.ctg PubDate.last16.log1p.ctg 0.007783530
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.last8.log1p PubDate.last8.log1p 0.054458821
## WordCount.nexp WordCount.nexp -0.053208396
## exclude.as.feat cor.y.abs
## PubDate.day.minutes.poly.1 FALSE 0.156753478
## PubDate.day.minutes.poly.3 FALSE 0.027983551
## PubDate.day.minutes.poly.3.ctg FALSE 0.014982807
## PubDate.day.minutes.poly.4 FALSE 0.073941394
## PubDate.day.minutes.poly.4.ctg FALSE 0.014601521
## PubDate.day.minutes.poly.5 FALSE 0.055929231
## PubDate.day.minutes.poly.5.ctg FALSE 0.014574775
## PubDate.last16.log1p FALSE 0.040735543
## PubDate.last16.log1p.ctg FALSE 0.007783530
## PubDate.last2.log1p.ctg FALSE 0.006916600
## PubDate.last32.log1p FALSE 0.003558081
## PubDate.last8.log1p FALSE 0.054458821
## WordCount.nexp FALSE 0.053208396
## cor.high.X freqRatio percentUnique
## PubDate.day.minutes.poly.1 <NA> 1.225490 18.08022
## PubDate.day.minutes.poly.3 <NA> 1.225490 18.08022
## PubDate.day.minutes.poly.3.ctg <NA> 1.083333 53.96509
## PubDate.day.minutes.poly.4 <NA> 1.225490 18.08022
## PubDate.day.minutes.poly.4.ctg <NA> 1.083333 53.94979
## PubDate.day.minutes.poly.5 <NA> 1.225490 18.08022
## PubDate.day.minutes.poly.5.ctg <NA> 1.083333 53.94979
## PubDate.last16.log1p <NA> 3.200000 84.44581
## PubDate.last16.log1p.ctg <NA> 60.000000 95.17759
## PubDate.last2.log1p.ctg <NA> 5.000000 92.19228
## PubDate.last32.log1p <NA> 8.000000 90.99816
## PubDate.last8.log1p PubDate.last4.log1p 1.142857 75.12247
## WordCount.nexp <NA> 17.761364 11.32884
## zeroVar nzv is.cor.y.abs.low
## PubDate.day.minutes.poly.1 FALSE FALSE FALSE
## PubDate.day.minutes.poly.3 FALSE FALSE FALSE
## PubDate.day.minutes.poly.3.ctg FALSE FALSE FALSE
## PubDate.day.minutes.poly.4 FALSE FALSE FALSE
## PubDate.day.minutes.poly.4.ctg FALSE FALSE FALSE
## PubDate.day.minutes.poly.5 FALSE FALSE FALSE
## PubDate.day.minutes.poly.5.ctg FALSE FALSE FALSE
## PubDate.last16.log1p FALSE FALSE FALSE
## PubDate.last16.log1p.ctg FALSE FALSE TRUE
## PubDate.last2.log1p.ctg FALSE FALSE TRUE
## PubDate.last32.log1p FALSE FALSE TRUE
## PubDate.last8.log1p FALSE FALSE FALSE
## WordCount.nexp FALSE FALSE FALSE
## interaction.feat shapiro.test.p.value
## PubDate.day.minutes.poly.1 <NA> 1.590362e-18
## PubDate.day.minutes.poly.3 <NA> 9.822405e-52
## PubDate.day.minutes.poly.3.ctg NDSSName.my.fctr 1.179915e-64
## PubDate.day.minutes.poly.4 <NA> 1.523136e-47
## PubDate.day.minutes.poly.4.ctg NDSSName.my.fctr 2.214419e-67
## PubDate.day.minutes.poly.5 <NA> 1.157500e-41
## PubDate.day.minutes.poly.5.ctg NDSSName.my.fctr 7.171204e-67
## PubDate.last16.log1p <NA> 7.310334e-68
## PubDate.last16.log1p.ctg NDSSName.my.fctr 6.216597e-76
## PubDate.last2.log1p.ctg NDSSName.my.fctr 1.991089e-37
## PubDate.last32.log1p <NA> 2.783236e-77
## PubDate.last8.log1p <NA> 3.859176e-56
## WordCount.nexp <NA> 9.108805e-94
## rsp_var_raw id_var rsp_var max
## PubDate.day.minutes.poly.1 FALSE NA NA 0.02475916
## PubDate.day.minutes.poly.3 FALSE NA NA 0.05215301
## PubDate.day.minutes.poly.3.ctg FALSE NA NA 0.56127224
## PubDate.day.minutes.poly.4 FALSE NA NA 0.06677441
## PubDate.day.minutes.poly.4.ctg FALSE NA NA 0.67700049
## PubDate.day.minutes.poly.5 FALSE NA NA 0.08471756
## PubDate.day.minutes.poly.5.ctg FALSE NA NA 0.56286316
## PubDate.last16.log1p FALSE NA NA 11.95698288
## PubDate.last16.log1p.ctg FALSE NA NA 15.77251197
## PubDate.last2.log1p.ctg FALSE NA NA 15.06116892
## PubDate.last32.log1p FALSE NA NA 12.32340669
## PubDate.last8.log1p FALSE NA NA 11.62246125
## WordCount.nexp FALSE NA NA 1.00000000
## min max.Popular.fctr.N
## PubDate.day.minutes.poly.1 -0.02749464 0.02468654
## PubDate.day.minutes.poly.3 -0.04512497 0.05150779
## PubDate.day.minutes.poly.3.ctg -0.66283168 0.55528441
## PubDate.day.minutes.poly.4 -0.01832740 0.06543120
## PubDate.day.minutes.poly.4.ctg -0.61188413 0.67700049
## PubDate.day.minutes.poly.5 -0.02450918 0.08217780
## PubDate.day.minutes.poly.5.ctg -0.71653445 0.56286316
## PubDate.last16.log1p 0.00000000 11.94531808
## PubDate.last16.log1p.ctg 0.00000000 15.72030254
## PubDate.last2.log1p.ctg 0.00000000 14.72999406
## PubDate.last32.log1p 0.00000000 12.21244232
## PubDate.last8.log1p 0.00000000 11.43577441
## WordCount.nexp 0.00000000 1.00000000
## max.Popular.fctr.Y min.Popular.fctr.N
## PubDate.day.minutes.poly.1 0.024468663 -0.02749464
## PubDate.day.minutes.poly.3 0.049597025 -0.04512497
## PubDate.day.minutes.poly.3.ctg 0.363266956 -0.66283168
## PubDate.day.minutes.poly.4 0.061490534 -0.01832740
## PubDate.day.minutes.poly.4.ctg 0.289618754 -0.61188413
## PubDate.day.minutes.poly.5 0.074814724 -0.02450918
## PubDate.day.minutes.poly.5.ctg 0.215852412 -0.71653445
## PubDate.last16.log1p 11.877603300 0.00000000
## PubDate.last16.log1p.ctg 15.629535143 0.00000000
## PubDate.last2.log1p.ctg 13.653551472 0.00000000
## PubDate.last32.log1p 12.178408497 0.00000000
## PubDate.last8.log1p 11.394288315 0.00000000
## WordCount.nexp 0.002478752 0.00000000
## min.Popular.fctr.Y
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.3.ctg -0.38280444
## PubDate.day.minutes.poly.4 -0.01821959
## PubDate.day.minutes.poly.4.ctg -0.28243219
## PubDate.day.minutes.poly.5 -0.02362780
## PubDate.day.minutes.poly.5.ctg -0.37058648
## PubDate.last16.log1p 0.00000000
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 0.00000000
## PubDate.last8.log1p 0.00000000
## WordCount.nexp 0.00000000
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02450498
## PubDate.day.minutes.poly.3 0.04991290
## PubDate.day.minutes.poly.3.ctg 0.41228572
## PubDate.day.minutes.poly.4 0.06213811
## PubDate.day.minutes.poly.4.ctg 0.49508199
## PubDate.day.minutes.poly.5 0.07601554
## PubDate.day.minutes.poly.5.ctg 0.48962874
## PubDate.last16.log1p 11.84854019
## PubDate.last16.log1p.ctg 15.72589745
## PubDate.last2.log1p.ctg 14.07883590
## PubDate.last32.log1p 12.17383350
## PubDate.last8.log1p 11.40150216
## WordCount.nexp 1.00000000
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02472285
## PubDate.day.minutes.poly.3 0.05182988
## PubDate.day.minutes.poly.3.ctg 0.43176828
## PubDate.day.minutes.poly.4 0.06610094
## PubDate.day.minutes.poly.4.ctg 0.45727441
## PubDate.day.minutes.poly.5 0.08344228
## PubDate.day.minutes.poly.5.ctg 0.41093522
## PubDate.last16.log1p 11.95698288
## PubDate.last16.log1p.ctg 15.77251197
## PubDate.last2.log1p.ctg 14.25350675
## PubDate.last32.log1p 12.21791228
## PubDate.last8.log1p 11.62246125
## WordCount.nexp 1.00000000
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.02749464
## PubDate.day.minutes.poly.3 -0.04512497
## PubDate.day.minutes.poly.3.ctg -0.39431764
## PubDate.day.minutes.poly.4 -0.01832685
## PubDate.day.minutes.poly.4.ctg -0.16387098
## PubDate.day.minutes.poly.5 -0.02450918
## PubDate.day.minutes.poly.5.ctg -0.21141841
## PubDate.last16.log1p 0.00000000
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 0.00000000
## PubDate.last8.log1p 0.00000000
## WordCount.nexp 0.00000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.3.ctg -0.24211460
## PubDate.day.minutes.poly.4 -0.01824013
## PubDate.day.minutes.poly.4.ctg -0.51296095
## PubDate.day.minutes.poly.5 -0.02362780
## PubDate.day.minutes.poly.5.ctg -0.54313226
## PubDate.last16.log1p 0.00000000
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 0.00000000
## PubDate.last8.log1p 7.01929665
## WordCount.nexp 0.00000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02399660
## PubDate.day.minutes.poly.3 0.04558390
## PubDate.day.minutes.poly.3.ctg 0.56127224
## PubDate.day.minutes.poly.4 0.05340046
## PubDate.day.minutes.poly.4.ctg 0.63819571
## PubDate.day.minutes.poly.5 0.06013194
## PubDate.day.minutes.poly.5.ctg 0.45824974
## PubDate.last16.log1p 11.85180908
## PubDate.last16.log1p.ctg 15.68420514
## PubDate.last2.log1p.ctg 15.06116892
## PubDate.last32.log1p 12.32340669
## PubDate.last8.log1p 11.27955479
## WordCount.nexp 1.00000000
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02475916
## PubDate.day.minutes.poly.3 0.05215301
## PubDate.day.minutes.poly.3.ctg 0.34217266
## PubDate.day.minutes.poly.4 0.06677441
## PubDate.day.minutes.poly.4.ctg 0.38235412
## PubDate.day.minutes.poly.5 0.08471756
## PubDate.day.minutes.poly.5.ctg 0.42244492
## PubDate.last16.log1p 11.88113167
## PubDate.last16.log1p.ctg 15.67548989
## PubDate.last2.log1p.ctg 14.77515997
## PubDate.last32.log1p 12.30973422
## PubDate.last8.log1p 11.33227851
## WordCount.nexp 0.01831564
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.3.ctg -0.65503648
## PubDate.day.minutes.poly.4 -0.01832268
## PubDate.day.minutes.poly.4.ctg -0.23960642
## PubDate.day.minutes.poly.5 -0.02362780
## PubDate.day.minutes.poly.5.ctg -0.35475727
## PubDate.last16.log1p 8.10167775
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 8.86290830
## PubDate.last8.log1p 7.06133437
## WordCount.nexp 0.00000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.02749464
## PubDate.day.minutes.poly.3 -0.04512497
## PubDate.day.minutes.poly.3.ctg -0.33854383
## PubDate.day.minutes.poly.4 -0.01820339
## PubDate.day.minutes.poly.4.ctg -0.20524473
## PubDate.day.minutes.poly.5 -0.02450918
## PubDate.day.minutes.poly.5.ctg -0.28096322
## PubDate.last16.log1p 8.09223941
## PubDate.last16.log1p.ctg 0.00000000
## PubDate.last2.log1p.ctg 0.00000000
## PubDate.last32.log1p 8.83579237
## PubDate.last8.log1p 6.89162590
## WordCount.nexp 0.00000000
## [1] "OOBobs Popular.fctr.All.X..rcv.glmnet N: min < min of Train range: 1"
## UniqueID Popular.fctr.All.X..rcv.glmnet
## 1833 1833 N
## PubDate.day.minutes.poly.1.ctg
## 1833 -0.7070114
## id cor.y
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## exclude.as.feat cor.y.abs cor.high.X
## PubDate.day.minutes.poly.1.ctg FALSE 0.002432289 <NA>
## freqRatio percentUnique zeroVar nzv
## PubDate.day.minutes.poly.1.ctg 1.083333 53.96509 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## PubDate.day.minutes.poly.1.ctg TRUE NDSSName.my.fctr
## shapiro.test.p.value rsp_var_raw id_var
## PubDate.day.minutes.poly.1.ctg 1.051535e-45 FALSE NA
## rsp_var max min
## PubDate.day.minutes.poly.1.ctg NA 0.4812771 -0.7070114
## max.Popular.fctr.N max.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg 0.3596891 0.3295025
## min.Popular.fctr.N min.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg -0.3997769 -0.2457038
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.4812771
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.197731
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.7070114
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2842221
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.2260955
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.1964159
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.2546548
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2531302
## [1] "OOBobs Popular.fctr.All.X..rcv.glmnet N: max > max of Train range: 2"
## UniqueID Popular.fctr.All.X..rcv.glmnet
## 4402 4402 N
## 1924 1924 N
## PubDate.day.minutes.poly.1.ctg PubDate.last16.log1p.ctg
## 4402 0.48127714 0.0000
## 1924 0.04318327 15.7259
## id cor.y
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## PubDate.last16.log1p.ctg PubDate.last16.log1p.ctg 0.007783530
## exclude.as.feat cor.y.abs cor.high.X
## PubDate.day.minutes.poly.1.ctg FALSE 0.002432289 <NA>
## PubDate.last16.log1p.ctg FALSE 0.007783530 <NA>
## freqRatio percentUnique zeroVar nzv
## PubDate.day.minutes.poly.1.ctg 1.083333 53.96509 FALSE FALSE
## PubDate.last16.log1p.ctg 60.000000 95.17759 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## PubDate.day.minutes.poly.1.ctg TRUE NDSSName.my.fctr
## PubDate.last16.log1p.ctg TRUE NDSSName.my.fctr
## shapiro.test.p.value rsp_var_raw id_var
## PubDate.day.minutes.poly.1.ctg 1.051535e-45 FALSE NA
## PubDate.last16.log1p.ctg 6.216597e-76 FALSE NA
## rsp_var max min
## PubDate.day.minutes.poly.1.ctg NA 0.4812771 -0.7070114
## PubDate.last16.log1p.ctg NA 15.7725120 0.0000000
## max.Popular.fctr.N max.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg 0.3596891 0.3295025
## PubDate.last16.log1p.ctg 15.7203025 15.6295351
## min.Popular.fctr.N min.Popular.fctr.Y
## PubDate.day.minutes.poly.1.ctg -0.3997769 -0.2457038
## PubDate.last16.log1p.ctg 0.0000000 0.0000000
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.4812771
## PubDate.last16.log1p.ctg 15.7258974
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.197731
## PubDate.last16.log1p.ctg 15.772512
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.7070114
## PubDate.last16.log1p.ctg 0.0000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2842221
## PubDate.last16.log1p.ctg 0.0000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg 0.2260955
## PubDate.last16.log1p.ctg 15.6842051
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg 0.1964159
## PubDate.last16.log1p.ctg 15.6754899
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1.ctg -0.2546548
## PubDate.last16.log1p.ctg 0.0000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1.ctg -0.2531302
## PubDate.last16.log1p.ctg 0.0000000
## [1] "OOBobs total range outliers: 25"
## [1] "newobs Popular.fctr.Final..rcv.glmnet N: max > max of Train range: 1152"
## UniqueID Popular.fctr.Final..rcv.glmnet
## 6533 6533 N
## 6541 6541 N
## 6542 6542 N
## 6543 6543 N
## 6545 6545 N
## 6546 6546 N
## PubDate.day.minutes.poly.3.ctg PubDate.juliandate
## 6533 0.013816963 335
## 6541 0.034723715 335
## 6542 0.037049157 335
## 6543 0.024918224 335
## 6545 -0.031573148 335
## 6546 0.009157074 335
## PubDate.last2.log1p.ctg PubDate.last32.log1p PubDate.last32.log1p.ctg
## 6533 11.020840 10.134321 13.09830
## 6541 10.856592 9.442800 13.12686
## 6542 11.034890 9.391411 13.12590
## 6543 9.618070 9.400878 12.81628
## 6545 9.879502 9.402860 12.99054
## 6546 8.798153 9.421978 12.95790
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg
## 6533 11.253274 11.89583
## 6541 11.076186 11.28744
## 6542 11.184338 11.82491
## 6543 10.450423 11.38482
## 6545 9.953134 11.38843
## 6546 9.379408 11.28698
## UniqueID Popular.fctr.Final..rcv.glmnet
## 6634 6634 N
## 7497 7497 N
## 7589 7589 N
## 7726 7726 N
## 7914 7914 N
## 8236 8236 N
## PubDate.day.minutes.poly.3.ctg PubDate.juliandate
## 6634 -0.03207389 336
## 7497 0.10645902 347
## 7589 -0.06556391 349
## 7726 0.02210707 351
## 7914 -0.00890592 352
## 8236 0.04836251 360
## PubDate.last2.log1p.ctg PubDate.last32.log1p PubDate.last32.log1p.ctg
## 6634 9.645300 10.167581 13.03136
## 7497 7.440734 11.288156 11.75413
## 7589 10.784980 10.974420 11.97111
## 7726 8.187855 9.192278 11.97877
## 7914 10.960427 10.720753 13.12793
## 8236 11.370474 11.729246 0.00000
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg
## 6634 10.030384 11.386126
## 7497 9.084777 9.869103
## 7589 10.857941 10.987223
## 7726 8.843471 11.078660
## 7914 11.133026 11.864969
## 8236 12.709290 13.274841
## UniqueID Popular.fctr.Final..rcv.glmnet
## 8394 8394 N
## 8395 8395 N
## 8396 8396 N
## 8398 8398 N
## 8400 8400 N
## 8401 8401 N
## PubDate.day.minutes.poly.3.ctg PubDate.juliandate
## 8394 -0.2772583 365
## 8395 0.1129868 365
## 8396 0.1587658 365
## 8398 -0.1431270 365
## 8400 -0.1380997 365
## 8401 0.1926449 365
## PubDate.last2.log1p.ctg PubDate.last32.log1p PubDate.last32.log1p.ctg
## 8394 13.13739 11.14224 15.14526
## 8395 11.44674 11.14428 15.14071
## 8396 13.98310 11.11651 15.39121
## 8398 6.43294 11.09410 15.90984
## 8400 15.06117 11.07855 15.92029
## 8401 11.51832 11.05398 15.91539
## PubDate.last4.log1p.ctg PubDate.last8.log1p.ctg
## 8394 13.41328 13.96451
## 8395 13.30663 13.83971
## 8396 14.06262 14.57832
## 8398 15.06135 15.37495
## 8400 15.35403 15.48083
## 8401 14.81295 15.38483
## id cor.y
## PubDate.day.minutes.poly.3.ctg PubDate.day.minutes.poly.3.ctg 0.014982807
## PubDate.juliandate PubDate.juliandate 0.014361075
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.last32.log1p.ctg PubDate.last32.log1p.ctg 0.015395971
## PubDate.last4.log1p.ctg PubDate.last4.log1p.ctg 0.004792781
## PubDate.last8.log1p.ctg PubDate.last8.log1p.ctg 0.003914960
## exclude.as.feat cor.y.abs
## PubDate.day.minutes.poly.3.ctg FALSE 0.014982807
## PubDate.juliandate FALSE 0.014361075
## PubDate.last2.log1p.ctg FALSE 0.006916600
## PubDate.last32.log1p FALSE 0.003558081
## PubDate.last32.log1p.ctg FALSE 0.015395971
## PubDate.last4.log1p.ctg FALSE 0.004792781
## PubDate.last8.log1p.ctg FALSE 0.003914960
## cor.high.X freqRatio percentUnique
## PubDate.day.minutes.poly.3.ctg <NA> 1.083333 53.965095
## PubDate.juliandate PubDate.month.fctr 1.032520 1.393141
## PubDate.last2.log1p.ctg <NA> 5.000000 92.192284
## PubDate.last32.log1p <NA> 8.000000 90.998163
## PubDate.last32.log1p.ctg <NA> 239.000000 92.115738
## PubDate.last4.log1p.ctg <NA> 20.000000 95.881813
## PubDate.last8.log1p.ctg <NA> 40.000000 96.417636
## zeroVar nzv is.cor.y.abs.low
## PubDate.day.minutes.poly.3.ctg FALSE FALSE FALSE
## PubDate.juliandate FALSE FALSE FALSE
## PubDate.last2.log1p.ctg FALSE FALSE TRUE
## PubDate.last32.log1p FALSE FALSE TRUE
## PubDate.last32.log1p.ctg FALSE FALSE FALSE
## PubDate.last4.log1p.ctg FALSE FALSE TRUE
## PubDate.last8.log1p.ctg FALSE FALSE TRUE
## interaction.feat shapiro.test.p.value
## PubDate.day.minutes.poly.3.ctg NDSSName.my.fctr 1.179915e-64
## PubDate.juliandate <NA> 1.389406e-35
## PubDate.last2.log1p.ctg NDSSName.my.fctr 1.991089e-37
## PubDate.last32.log1p <NA> 2.783236e-77
## PubDate.last32.log1p.ctg NDSSName.my.fctr 1.647772e-78
## PubDate.last4.log1p.ctg NDSSName.my.fctr 5.833827e-54
## PubDate.last8.log1p.ctg NDSSName.my.fctr 2.241558e-67
## rsp_var_raw id_var rsp_var max
## PubDate.day.minutes.poly.3.ctg FALSE NA NA 0.5612722
## PubDate.juliandate FALSE NA NA 365.0000000
## PubDate.last2.log1p.ctg FALSE NA NA 15.0611689
## PubDate.last32.log1p FALSE NA NA 12.3234067
## PubDate.last32.log1p.ctg FALSE NA NA 15.9202866
## PubDate.last4.log1p.ctg FALSE NA NA 15.3540272
## PubDate.last8.log1p.ctg FALSE NA NA 15.4808349
## min max.Popular.fctr.N
## PubDate.day.minutes.poly.3.ctg -0.6628317 0.5552844
## PubDate.juliandate 244.0000000 334.0000000
## PubDate.last2.log1p.ctg 0.0000000 14.7299941
## PubDate.last32.log1p 0.0000000 12.2124423
## PubDate.last32.log1p.ctg 0.0000000 15.3192168
## PubDate.last4.log1p.ctg 0.0000000 14.6582245
## PubDate.last8.log1p.ctg 0.0000000 15.1997598
## max.Popular.fctr.Y min.Popular.fctr.N
## PubDate.day.minutes.poly.3.ctg 0.363267 -0.6628317
## PubDate.juliandate 334.000000 244.0000000
## PubDate.last2.log1p.ctg 14.253507 0.0000000
## PubDate.last32.log1p 12.217912 0.0000000
## PubDate.last32.log1p.ctg 15.300332 0.0000000
## PubDate.last4.log1p.ctg 14.733865 0.0000000
## PubDate.last8.log1p.ctg 15.122845 0.0000000
## min.Popular.fctr.Y
## PubDate.day.minutes.poly.3.ctg -0.3828044
## PubDate.juliandate 244.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 0.0000000
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.3.ctg 0.4122857
## PubDate.juliandate 332.0000000
## PubDate.last2.log1p.ctg 14.0788359
## PubDate.last32.log1p 12.1738335
## PubDate.last32.log1p.ctg 15.2783647
## PubDate.last4.log1p.ctg 14.5891468
## PubDate.last8.log1p.ctg 15.1063837
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.3.ctg 0.4317683
## PubDate.juliandate 334.0000000
## PubDate.last2.log1p.ctg 14.2535067
## PubDate.last32.log1p 12.2179123
## PubDate.last32.log1p.ctg 15.1773142
## PubDate.last4.log1p.ctg 14.5794661
## PubDate.last8.log1p.ctg 15.0339077
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.3.ctg -0.3943176
## PubDate.juliandate 244.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 0.0000000
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.3.ctg -0.2421146
## PubDate.juliandate 244.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 0.0000000
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.3.ctg 0.5612722
## PubDate.juliandate 365.0000000
## PubDate.last2.log1p.ctg 15.0611689
## PubDate.last32.log1p 12.3234067
## PubDate.last32.log1p.ctg 15.9202866
## PubDate.last4.log1p.ctg 15.3540272
## PubDate.last8.log1p.ctg 15.4808349
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.3.ctg 0.3421727
## PubDate.juliandate 365.0000000
## PubDate.last2.log1p.ctg 14.7751600
## PubDate.last32.log1p 12.3097342
## PubDate.last32.log1p.ctg 15.9193719
## PubDate.last4.log1p.ctg 15.3435742
## PubDate.last8.log1p.ctg 15.4661689
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.3.ctg -0.6550365
## PubDate.juliandate 335.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 8.8629083
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.3.ctg -0.3385438
## PubDate.juliandate 335.0000000
## PubDate.last2.log1p.ctg 0.0000000
## PubDate.last32.log1p 8.8357924
## PubDate.last32.log1p.ctg 0.0000000
## PubDate.last4.log1p.ctg 0.0000000
## PubDate.last8.log1p.ctg 0.0000000
## [1] "newobs Popular.fctr.Final..rcv.glmnet Y: min < min of Train range: 4"
## UniqueID Popular.fctr.Final..rcv.glmnet PubDate.day.minutes.poly.1
## 7921 7921 Y -0.0274946397
## 8217 8217 Y 0.0110693210
## 8360 8360 Y 0.0119408230
## 8375 8375 Y -0.0006959551
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.3
## 7921 -0.03700426 -0.0451249711
## 8217 0.01104094 -0.0085868767
## 8360 -0.25313017 -0.0076596964
## 8375 -0.25313017 -0.0008261257
## PubDate.day.minutes.poly.5 WordCount.log1p WordCount.root2
## 7921 -0.024509181 6.304449 23.36664
## 8217 -0.001886744 1.609438 2.00000
## 8360 -0.005038384 7.202661 36.63332
## 8375 0.001230604 6.274762 23.02173
## id cor.y
## PubDate.day.minutes.poly.1 PubDate.day.minutes.poly.1 0.156753478
## PubDate.day.minutes.poly.1.ctg PubDate.day.minutes.poly.1.ctg -0.002432289
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3 0.027983551
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5 -0.055929231
## WordCount.log1p WordCount.log1p 0.254319628
## WordCount.root2 WordCount.root2 0.292120679
## exclude.as.feat cor.y.abs cor.high.X
## PubDate.day.minutes.poly.1 FALSE 0.156753478 <NA>
## PubDate.day.minutes.poly.1.ctg FALSE 0.002432289 <NA>
## PubDate.day.minutes.poly.3 FALSE 0.027983551 <NA>
## PubDate.day.minutes.poly.5 FALSE 0.055929231 <NA>
## WordCount.log1p FALSE 0.254319628 WordCount.root2
## WordCount.root2 FALSE 0.292120679 <NA>
## freqRatio percentUnique zeroVar nzv
## PubDate.day.minutes.poly.1 1.225490 18.08022 FALSE FALSE
## PubDate.day.minutes.poly.1.ctg 1.083333 53.96509 FALSE FALSE
## PubDate.day.minutes.poly.3 1.225490 18.08022 FALSE FALSE
## PubDate.day.minutes.poly.5 1.225490 18.08022 FALSE FALSE
## WordCount.log1p 2.315789 24.15799 FALSE FALSE
## WordCount.root2 2.315789 24.15799 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## PubDate.day.minutes.poly.1 FALSE <NA>
## PubDate.day.minutes.poly.1.ctg TRUE NDSSName.my.fctr
## PubDate.day.minutes.poly.3 FALSE <NA>
## PubDate.day.minutes.poly.5 FALSE <NA>
## WordCount.log1p FALSE <NA>
## WordCount.root2 FALSE <NA>
## shapiro.test.p.value rsp_var_raw id_var
## PubDate.day.minutes.poly.1 1.590362e-18 FALSE NA
## PubDate.day.minutes.poly.1.ctg 1.051535e-45 FALSE NA
## PubDate.day.minutes.poly.3 9.822405e-52 FALSE NA
## PubDate.day.minutes.poly.5 1.157500e-41 FALSE NA
## WordCount.log1p 1.576866e-49 FALSE NA
## WordCount.root2 4.556481e-30 FALSE NA
## rsp_var max min
## PubDate.day.minutes.poly.1 NA 0.02475916 -0.02749464
## PubDate.day.minutes.poly.1.ctg NA 0.48127714 -0.70701144
## PubDate.day.minutes.poly.3 NA 0.05215301 -0.04512497
## PubDate.day.minutes.poly.5 NA 0.08471756 -0.02450918
## WordCount.log1p NA 9.29771002 0.00000000
## WordCount.root2 NA 104.46051886 0.00000000
## max.Popular.fctr.N max.Popular.fctr.Y
## PubDate.day.minutes.poly.1 0.02472285 0.02446866
## PubDate.day.minutes.poly.1.ctg 0.48127714 0.32950245
## PubDate.day.minutes.poly.3 0.05182988 0.04959703
## PubDate.day.minutes.poly.5 0.08344228 0.07481472
## WordCount.log1p 8.81966535 9.29771002
## WordCount.root2 82.24962006 104.46051886
## min.Popular.fctr.N min.Popular.fctr.Y
## PubDate.day.minutes.poly.1 -0.02749464 -0.02745833
## PubDate.day.minutes.poly.1.ctg -0.70701144 -0.24570380
## PubDate.day.minutes.poly.3 -0.04512497 -0.04482024
## PubDate.day.minutes.poly.5 -0.02450918 -0.02362780
## WordCount.log1p 0.00000000 1.94591015
## WordCount.root2 0.00000000 2.44948974
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02450498
## PubDate.day.minutes.poly.1.ctg 0.48127714
## PubDate.day.minutes.poly.3 0.04991290
## PubDate.day.minutes.poly.5 0.07601554
## WordCount.log1p 7.05961763
## WordCount.root2 34.10278581
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02472285
## PubDate.day.minutes.poly.1.ctg 0.19773100
## PubDate.day.minutes.poly.3 0.05182988
## PubDate.day.minutes.poly.5 0.08344228
## WordCount.log1p 9.14088311
## WordCount.root2 96.58157174
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.02749464
## PubDate.day.minutes.poly.1.ctg -0.70701144
## PubDate.day.minutes.poly.3 -0.04512497
## PubDate.day.minutes.poly.5 -0.02450918
## WordCount.log1p 0.00000000
## WordCount.root2 0.00000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.1.ctg -0.28422208
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.5 -0.02362780
## WordCount.log1p 0.00000000
## WordCount.root2 0.00000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02399660
## PubDate.day.minutes.poly.1.ctg 0.22609552
## PubDate.day.minutes.poly.3 0.04558390
## PubDate.day.minutes.poly.5 0.06013194
## WordCount.log1p 7.61332498
## WordCount.root2 44.98888752
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02475916
## PubDate.day.minutes.poly.1.ctg 0.19641586
## PubDate.day.minutes.poly.3 0.05215301
## PubDate.day.minutes.poly.5 0.08471756
## WordCount.log1p 8.69232228
## WordCount.root2 77.17512553
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.02745833
## PubDate.day.minutes.poly.1.ctg -0.25465485
## PubDate.day.minutes.poly.3 -0.04482024
## PubDate.day.minutes.poly.5 -0.02362780
## WordCount.log1p 0.00000000
## WordCount.root2 0.00000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.02749464
## PubDate.day.minutes.poly.1.ctg -0.25313017
## PubDate.day.minutes.poly.3 -0.04512497
## PubDate.day.minutes.poly.5 -0.02450918
## WordCount.log1p 1.60943791
## WordCount.root2 2.00000000
## [1] "newobs Popular.fctr.Final..rcv.glmnet Y: max > max of Train range: 718"
## UniqueID Popular.fctr.Final..rcv.glmnet PubDate.day.minutes.poly.1
## 6534 6534 Y 0.02047428
## 6535 6535 Y 0.02043797
## 6536 6536 Y 0.01840446
## 6537 6537 Y 0.01437377
## 6538 6538 Y 0.01408327
## 6539 6539 Y 0.01390170
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.3
## 6534 0.024776665 0.020825101
## 6535 0.024663104 0.020614888
## 6536 0.018611080 0.010192448
## 6537 0.008400234 -0.003371562
## 6538 0.007755996 -0.004024505
## 6539 0.007359607 -0.004412350
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4.ctg
## 6534 0.010136126 -0.026117829
## 6535 0.009828175 -0.032426274
## 6536 -0.003724687 0.002579915
## 6537 -0.013710045 0.024407410
## 6538 -0.013788116 0.024419023
## 6539 -0.013803062 0.010731259
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5.ctg
## 6534 -0.004924177 -0.04109512
## 6535 -0.005269034 -0.03723150
## 6536 -0.017201046 0.02875121
## 6537 -0.013755146 0.01925208
## 6538 -0.012806242 -0.01676509
## 6539 -0.012191759 -0.02825414
## PubDate.juliandate PubDate.last16.log1p PubDate.last2.log1p.ctg
## 6534 335 9.732284 9.639001
## 6535 335 9.796793 9.896664
## 6536 335 9.590761 9.540219
## 6537 335 9.056023 9.362546
## 6538 335 8.998384 8.234830
## 6539 335 8.966356 7.949797
## PubDate.last32.log1p PubDate.last32.log1p.ctg PubDate.last4.log1p.ctg
## 6534 10.036094 13.03463 10.283942
## 6535 10.053458 13.04093 10.357965
## 6536 9.939434 13.01641 9.780020
## 6537 9.564863 13.10988 9.580386
## 6538 9.542733 13.09908 8.916506
## 6539 9.572898 13.11828 10.909784
## PubDate.last8.log1p.ctg WordCount.nexp
## 6534 10.51086 4.609768e-243
## 6535 11.36460 0.000000e+00
## 6536 11.33092 0.000000e+00
## 6537 11.27835 3.128062e-93
## 6538 11.28285 0.000000e+00
## 6539 11.26846 0.000000e+00
## UniqueID Popular.fctr.Final..rcv.glmnet PubDate.day.minutes.poly.1
## 6671 6671 Y 0.002463239
## 6867 6867 Y -0.001966895
## 7295 7295 Y -0.011335541
## 7514 7514 Y 0.008563753
## 7714 7714 Y 0.011178259
## 8136 8136 Y -0.005707091
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.3
## 6671 -0.0079057889 -0.005335173
## 6867 -0.0086839125 0.001124609
## 7295 -0.0008881085 0.011076612
## 7514 -0.0021424925 -0.009708339
## 7714 0.0019915718 -0.008487221
## 8136 -0.0071090787 0.006558068
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4.ctg
## 6671 0.006293548 -0.03640140
## 6867 0.009647738 -0.01169510
## 7295 -0.005522885 0.01622926
## 7514 -0.006396495 0.07052515
## 7714 -0.011487039 0.02778502
## 8136 0.006705913 -0.02157952
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5.ctg
## 6671 0.007723689 -0.01480682
## 6867 -0.001918157 -0.07418258
## 7295 -0.007547158 0.05433712
## 7514 0.005596147 0.02094061
## 7714 -0.002271137 0.01200212
## 8136 -0.009755955 -0.04948684
## PubDate.juliandate PubDate.last16.log1p PubDate.last2.log1p.ctg
## 6671 336 9.199482 9.562616
## 6867 338 9.375855 11.241169
## 7295 344 10.686338 12.444416
## 7514 348 11.565043 8.592672
## 7714 351 8.776321 7.870930
## 8136 357 9.578657 8.386401
## PubDate.last32.log1p PubDate.last32.log1p.ctg PubDate.last4.log1p.ctg
## 6671 9.941120 13.21616 10.273429
## 6867 10.132891 14.06439 11.867988
## 7295 10.944100 14.20064 12.533821
## 7514 12.058442 12.04425 9.226804
## 7714 9.346792 12.05565 9.143452
## 8136 10.952787 12.59202 9.079890
## PubDate.last8.log1p.ctg WordCount.nexp
## 6671 12.056534 0.000000e+00
## 6867 12.282408 0.000000e+00
## 7295 12.969075 0.000000e+00
## 7514 9.797015 3.876168e-292
## 7714 11.174273 0.000000e+00
## 8136 10.165159 2.506567e-46
## UniqueID Popular.fctr.Final..rcv.glmnet PubDate.day.minutes.poly.1
## 8386 8386 Y -0.004762964
## 8391 8391 Y -0.007559033
## 8392 8392 Y -0.007885846
## 8397 8397 Y -0.012134418
## 8399 8399 Y -0.014204235
## 8402 8402 Y -0.027458327
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.3
## 8386 -0.0076994040 0.005283252
## 8391 -0.0055729590 0.008724731
## 8392 -0.0052498766 0.009050092
## 8397 0.0003698638 0.011101808
## 8399 0.0040627987 0.010203360
## 8402 0.0425437738 -0.044820236
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4.ctg
## 8386 0.007937277 -0.194219443
## 8391 0.003454984 -0.129369992
## 8392 0.002780751 0.019130981
## 8397 -0.007579609 0.014839874
## 8399 -0.012625915 0.021396462
## 8402 0.033658267 0.001133258
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5.ctg
## 8386 -0.008201720 0.12356866
## 8391 -0.011437989 -0.06697667
## 8392 -0.011513182 0.15930790
## 8397 -0.005346258 -0.20117350
## 8399 0.002370447 0.16099960
## 8402 -0.023627798 -0.19528592
## PubDate.juliandate PubDate.last16.log1p PubDate.last2.log1p.ctg
## 8386 365 10.53223 12.64880
## 8391 365 10.97006 11.99075
## 8392 365 10.96164 13.18411
## 8397 365 10.91201 14.50789
## 8399 365 10.86224 14.77516
## 8402 365 10.46110 11.93860
## PubDate.last32.log1p PubDate.last32.log1p.ctg PubDate.last4.log1p.ctg
## 8386 11.19043 0.00000 13.40155
## 8391 11.15722 0.00000 12.95591
## 8392 11.15937 15.18202 13.66929
## 8397 11.11095 15.53790 14.55366
## 8399 11.10274 15.91937 15.34357
## 8402 10.79296 15.91787 11.94265
## PubDate.last8.log1p.ctg WordCount.nexp
## 8386 13.98196 0
## 8391 14.00877 0
## 8392 14.13371 0
## 8397 14.83106 0
## 8399 15.46617 0
## 8402 15.38649 0
## id cor.y
## PubDate.day.minutes.poly.1 PubDate.day.minutes.poly.1 0.156753478
## PubDate.day.minutes.poly.2 PubDate.day.minutes.poly.2 0.070977720
## PubDate.day.minutes.poly.3 PubDate.day.minutes.poly.3 0.027983551
## PubDate.day.minutes.poly.4 PubDate.day.minutes.poly.4 0.073941394
## PubDate.day.minutes.poly.4.ctg PubDate.day.minutes.poly.4.ctg 0.014601521
## PubDate.day.minutes.poly.5 PubDate.day.minutes.poly.5 -0.055929231
## PubDate.day.minutes.poly.5.ctg PubDate.day.minutes.poly.5.ctg 0.014574775
## PubDate.juliandate PubDate.juliandate 0.014361075
## PubDate.last16.log1p PubDate.last16.log1p 0.040735543
## PubDate.last2.log1p.ctg PubDate.last2.log1p.ctg 0.006916600
## PubDate.last32.log1p PubDate.last32.log1p 0.003558081
## PubDate.last32.log1p.ctg PubDate.last32.log1p.ctg 0.015395971
## PubDate.last4.log1p.ctg PubDate.last4.log1p.ctg 0.004792781
## PubDate.last8.log1p.ctg PubDate.last8.log1p.ctg 0.003914960
## WordCount.nexp WordCount.nexp -0.053208396
## exclude.as.feat cor.y.abs
## PubDate.day.minutes.poly.1 FALSE 0.156753478
## PubDate.day.minutes.poly.2 FALSE 0.070977720
## PubDate.day.minutes.poly.3 FALSE 0.027983551
## PubDate.day.minutes.poly.4 FALSE 0.073941394
## PubDate.day.minutes.poly.4.ctg FALSE 0.014601521
## PubDate.day.minutes.poly.5 FALSE 0.055929231
## PubDate.day.minutes.poly.5.ctg FALSE 0.014574775
## PubDate.juliandate FALSE 0.014361075
## PubDate.last16.log1p FALSE 0.040735543
## PubDate.last2.log1p.ctg FALSE 0.006916600
## PubDate.last32.log1p FALSE 0.003558081
## PubDate.last32.log1p.ctg FALSE 0.015395971
## PubDate.last4.log1p.ctg FALSE 0.004792781
## PubDate.last8.log1p.ctg FALSE 0.003914960
## WordCount.nexp FALSE 0.053208396
## cor.high.X freqRatio percentUnique
## PubDate.day.minutes.poly.1 <NA> 1.225490 18.080220
## PubDate.day.minutes.poly.2 <NA> 1.225490 18.080220
## PubDate.day.minutes.poly.3 <NA> 1.225490 18.080220
## PubDate.day.minutes.poly.4 <NA> 1.225490 18.080220
## PubDate.day.minutes.poly.4.ctg <NA> 1.083333 53.949786
## PubDate.day.minutes.poly.5 <NA> 1.225490 18.080220
## PubDate.day.minutes.poly.5.ctg <NA> 1.083333 53.949786
## PubDate.juliandate PubDate.month.fctr 1.032520 1.393141
## PubDate.last16.log1p <NA> 3.200000 84.445805
## PubDate.last2.log1p.ctg <NA> 5.000000 92.192284
## PubDate.last32.log1p <NA> 8.000000 90.998163
## PubDate.last32.log1p.ctg <NA> 239.000000 92.115738
## PubDate.last4.log1p.ctg <NA> 20.000000 95.881813
## PubDate.last8.log1p.ctg <NA> 40.000000 96.417636
## WordCount.nexp <NA> 17.761364 11.328843
## zeroVar nzv is.cor.y.abs.low
## PubDate.day.minutes.poly.1 FALSE FALSE FALSE
## PubDate.day.minutes.poly.2 FALSE FALSE FALSE
## PubDate.day.minutes.poly.3 FALSE FALSE FALSE
## PubDate.day.minutes.poly.4 FALSE FALSE FALSE
## PubDate.day.minutes.poly.4.ctg FALSE FALSE FALSE
## PubDate.day.minutes.poly.5 FALSE FALSE FALSE
## PubDate.day.minutes.poly.5.ctg FALSE FALSE FALSE
## PubDate.juliandate FALSE FALSE FALSE
## PubDate.last16.log1p FALSE FALSE FALSE
## PubDate.last2.log1p.ctg FALSE FALSE TRUE
## PubDate.last32.log1p FALSE FALSE TRUE
## PubDate.last32.log1p.ctg FALSE FALSE FALSE
## PubDate.last4.log1p.ctg FALSE FALSE TRUE
## PubDate.last8.log1p.ctg FALSE FALSE TRUE
## WordCount.nexp FALSE FALSE FALSE
## interaction.feat shapiro.test.p.value
## PubDate.day.minutes.poly.1 <NA> 1.590362e-18
## PubDate.day.minutes.poly.2 <NA> 8.020999e-64
## PubDate.day.minutes.poly.3 <NA> 9.822405e-52
## PubDate.day.minutes.poly.4 <NA> 1.523136e-47
## PubDate.day.minutes.poly.4.ctg NDSSName.my.fctr 2.214419e-67
## PubDate.day.minutes.poly.5 <NA> 1.157500e-41
## PubDate.day.minutes.poly.5.ctg NDSSName.my.fctr 7.171204e-67
## PubDate.juliandate <NA> 1.389406e-35
## PubDate.last16.log1p <NA> 7.310334e-68
## PubDate.last2.log1p.ctg NDSSName.my.fctr 1.991089e-37
## PubDate.last32.log1p <NA> 2.783236e-77
## PubDate.last32.log1p.ctg NDSSName.my.fctr 1.647772e-78
## PubDate.last4.log1p.ctg NDSSName.my.fctr 5.833827e-54
## PubDate.last8.log1p.ctg NDSSName.my.fctr 2.241558e-67
## WordCount.nexp <NA> 9.108805e-94
## rsp_var_raw id_var rsp_var max
## PubDate.day.minutes.poly.1 FALSE NA NA 0.02475916
## PubDate.day.minutes.poly.2 FALSE NA NA 0.04268445
## PubDate.day.minutes.poly.3 FALSE NA NA 0.05215301
## PubDate.day.minutes.poly.4 FALSE NA NA 0.06677441
## PubDate.day.minutes.poly.4.ctg FALSE NA NA 0.67700049
## PubDate.day.minutes.poly.5 FALSE NA NA 0.08471756
## PubDate.day.minutes.poly.5.ctg FALSE NA NA 0.56286316
## PubDate.juliandate FALSE NA NA 365.00000000
## PubDate.last16.log1p FALSE NA NA 11.95698288
## PubDate.last2.log1p.ctg FALSE NA NA 15.06116892
## PubDate.last32.log1p FALSE NA NA 12.32340669
## PubDate.last32.log1p.ctg FALSE NA NA 15.92028658
## PubDate.last4.log1p.ctg FALSE NA NA 15.35402717
## PubDate.last8.log1p.ctg FALSE NA NA 15.48083492
## WordCount.nexp FALSE NA NA 1.00000000
## min max.Popular.fctr.N
## PubDate.day.minutes.poly.1 -0.027494640 0.02472285
## PubDate.day.minutes.poly.2 -0.008758791 0.04268445
## PubDate.day.minutes.poly.3 -0.045124971 0.05182988
## PubDate.day.minutes.poly.4 -0.018327397 0.06610094
## PubDate.day.minutes.poly.4.ctg -0.611884133 0.67700049
## PubDate.day.minutes.poly.5 -0.024509181 0.08344228
## PubDate.day.minutes.poly.5.ctg -0.716534449 0.56286316
## PubDate.juliandate 244.000000000 334.00000000
## PubDate.last16.log1p 0.000000000 11.95698288
## PubDate.last2.log1p.ctg 0.000000000 14.72999406
## PubDate.last32.log1p 0.000000000 12.21244232
## PubDate.last32.log1p.ctg 0.000000000 15.31921677
## PubDate.last4.log1p.ctg 0.000000000 14.65822450
## PubDate.last8.log1p.ctg 0.000000000 15.19975983
## WordCount.nexp 0.000000000 1.00000000
## max.Popular.fctr.Y min.Popular.fctr.N
## PubDate.day.minutes.poly.1 2.446866e-02 -0.027494640
## PubDate.day.minutes.poly.2 4.254377e-02 -0.008758791
## PubDate.day.minutes.poly.3 4.959703e-02 -0.045124971
## PubDate.day.minutes.poly.4 6.149053e-02 -0.018327397
## PubDate.day.minutes.poly.4.ctg 3.330545e-01 -0.611884133
## PubDate.day.minutes.poly.5 7.481472e-02 -0.024509181
## PubDate.day.minutes.poly.5.ctg 2.158524e-01 -0.716534449
## PubDate.juliandate 3.340000e+02 244.000000000
## PubDate.last16.log1p 1.187760e+01 0.000000000
## PubDate.last2.log1p.ctg 1.425351e+01 0.000000000
## PubDate.last32.log1p 1.221791e+01 0.000000000
## PubDate.last32.log1p.ctg 1.530033e+01 0.000000000
## PubDate.last4.log1p.ctg 1.473386e+01 0.000000000
## PubDate.last8.log1p.ctg 1.512285e+01 0.000000000
## WordCount.nexp 2.478752e-03 0.000000000
## min.Popular.fctr.Y
## PubDate.day.minutes.poly.1 -0.027458327
## PubDate.day.minutes.poly.2 -0.008758791
## PubDate.day.minutes.poly.3 -0.044820236
## PubDate.day.minutes.poly.4 -0.018219595
## PubDate.day.minutes.poly.4.ctg -0.282432189
## PubDate.day.minutes.poly.5 -0.023627798
## PubDate.day.minutes.poly.5.ctg -0.370586479
## PubDate.juliandate 244.000000000
## PubDate.last16.log1p 0.000000000
## PubDate.last2.log1p.ctg 0.000000000
## PubDate.last32.log1p 0.000000000
## PubDate.last32.log1p.ctg 0.000000000
## PubDate.last4.log1p.ctg 0.000000000
## PubDate.last8.log1p.ctg 0.000000000
## WordCount.nexp 0.000000000
## max.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02450498
## PubDate.day.minutes.poly.2 0.04268445
## PubDate.day.minutes.poly.3 0.04991290
## PubDate.day.minutes.poly.4 0.06213811
## PubDate.day.minutes.poly.4.ctg 0.49508199
## PubDate.day.minutes.poly.5 0.07601554
## PubDate.day.minutes.poly.5.ctg 0.48962874
## PubDate.juliandate 332.00000000
## PubDate.last16.log1p 11.84854019
## PubDate.last2.log1p.ctg 14.07883590
## PubDate.last32.log1p 12.17383350
## PubDate.last32.log1p.ctg 15.27836472
## PubDate.last4.log1p.ctg 14.58914681
## PubDate.last8.log1p.ctg 15.10638373
## WordCount.nexp 1.00000000
## max.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02472285
## PubDate.day.minutes.poly.2 0.04254377
## PubDate.day.minutes.poly.3 0.05182988
## PubDate.day.minutes.poly.4 0.06610094
## PubDate.day.minutes.poly.4.ctg 0.45727441
## PubDate.day.minutes.poly.5 0.08344228
## PubDate.day.minutes.poly.5.ctg 0.41093522
## PubDate.juliandate 334.00000000
## PubDate.last16.log1p 11.95698288
## PubDate.last2.log1p.ctg 14.25350675
## PubDate.last32.log1p 12.21791228
## PubDate.last32.log1p.ctg 15.17731420
## PubDate.last4.log1p.ctg 14.57946609
## PubDate.last8.log1p.ctg 15.03390773
## WordCount.nexp 1.00000000
## min.Popular.fctr.All.X..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.027494640
## PubDate.day.minutes.poly.2 -0.008758672
## PubDate.day.minutes.poly.3 -0.045124971
## PubDate.day.minutes.poly.4 -0.018326850
## PubDate.day.minutes.poly.4.ctg -0.163870979
## PubDate.day.minutes.poly.5 -0.024509181
## PubDate.day.minutes.poly.5.ctg -0.211418410
## PubDate.juliandate 244.000000000
## PubDate.last16.log1p 0.000000000
## PubDate.last2.log1p.ctg 0.000000000
## PubDate.last32.log1p 0.000000000
## PubDate.last32.log1p.ctg 0.000000000
## PubDate.last4.log1p.ctg 0.000000000
## PubDate.last8.log1p.ctg 0.000000000
## WordCount.nexp 0.000000000
## min.Popular.fctr.All.X..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.027458327
## PubDate.day.minutes.poly.2 -0.008758791
## PubDate.day.minutes.poly.3 -0.044820236
## PubDate.day.minutes.poly.4 -0.018240126
## PubDate.day.minutes.poly.4.ctg -0.512960949
## PubDate.day.minutes.poly.5 -0.023627798
## PubDate.day.minutes.poly.5.ctg -0.543132262
## PubDate.juliandate 244.000000000
## PubDate.last16.log1p 0.000000000
## PubDate.last2.log1p.ctg 0.000000000
## PubDate.last32.log1p 0.000000000
## PubDate.last32.log1p.ctg 0.000000000
## PubDate.last4.log1p.ctg 0.000000000
## PubDate.last8.log1p.ctg 0.000000000
## WordCount.nexp 0.000000000
## max.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 0.02399660
## PubDate.day.minutes.poly.2 0.04254377
## PubDate.day.minutes.poly.3 0.04558390
## PubDate.day.minutes.poly.4 0.05340046
## PubDate.day.minutes.poly.4.ctg 0.63819571
## PubDate.day.minutes.poly.5 0.06013194
## PubDate.day.minutes.poly.5.ctg 0.45824974
## PubDate.juliandate 365.00000000
## PubDate.last16.log1p 11.85180908
## PubDate.last2.log1p.ctg 15.06116892
## PubDate.last32.log1p 12.32340669
## PubDate.last32.log1p.ctg 15.92028658
## PubDate.last4.log1p.ctg 15.35402717
## PubDate.last8.log1p.ctg 15.48083492
## WordCount.nexp 1.00000000
## max.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 0.02475916
## PubDate.day.minutes.poly.2 0.04268445
## PubDate.day.minutes.poly.3 0.05215301
## PubDate.day.minutes.poly.4 0.06677441
## PubDate.day.minutes.poly.4.ctg 0.38235412
## PubDate.day.minutes.poly.5 0.08471756
## PubDate.day.minutes.poly.5.ctg 0.42244492
## PubDate.juliandate 365.00000000
## PubDate.last16.log1p 11.88113167
## PubDate.last2.log1p.ctg 14.77515997
## PubDate.last32.log1p 12.30973422
## PubDate.last32.log1p.ctg 15.91937187
## PubDate.last4.log1p.ctg 15.34357419
## PubDate.last8.log1p.ctg 15.46616891
## WordCount.nexp 0.01831564
## min.Popular.fctr.Final..rcv.glmnet.N
## PubDate.day.minutes.poly.1 -0.027458327
## PubDate.day.minutes.poly.2 -0.008758791
## PubDate.day.minutes.poly.3 -0.044820236
## PubDate.day.minutes.poly.4 -0.018322678
## PubDate.day.minutes.poly.4.ctg -0.239606422
## PubDate.day.minutes.poly.5 -0.023627798
## PubDate.day.minutes.poly.5.ctg -0.354757272
## PubDate.juliandate 335.000000000
## PubDate.last16.log1p 8.101677747
## PubDate.last2.log1p.ctg 0.000000000
## PubDate.last32.log1p 8.862908295
## PubDate.last32.log1p.ctg 0.000000000
## PubDate.last4.log1p.ctg 0.000000000
## PubDate.last8.log1p.ctg 0.000000000
## WordCount.nexp 0.000000000
## min.Popular.fctr.Final..rcv.glmnet.Y
## PubDate.day.minutes.poly.1 -0.027494640
## PubDate.day.minutes.poly.2 -0.008758672
## PubDate.day.minutes.poly.3 -0.045124971
## PubDate.day.minutes.poly.4 -0.018203392
## PubDate.day.minutes.poly.4.ctg -0.205244731
## PubDate.day.minutes.poly.5 -0.024509181
## PubDate.day.minutes.poly.5.ctg -0.280963223
## PubDate.juliandate 335.000000000
## PubDate.last16.log1p 8.092239407
## PubDate.last2.log1p.ctg 0.000000000
## PubDate.last32.log1p 8.835792367
## PubDate.last32.log1p.ctg 0.000000000
## PubDate.last4.log1p.ctg 0.000000000
## PubDate.last8.log1p.ctg 0.000000000
## WordCount.nexp 0.000000000
## [1] "newobs total range outliers: 1870"
## numeric(0)
## [1] "glb_sel_mdl_id: All.X##rcv#glmnet"
## [1] "glb_fin_mdl_id: Final##rcv#glmnet"
## [1] "Cross Validation issues:"
## Warning in get_dsp_models_df(): Cross Validation issues:
## MFO###myMFO_classfr Random###myrandom_classfr
## 0 0
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1.cp.0###rpart
## 0 0
## id
## Max.cor.Y##rcv#rpart Max.cor.Y##rcv#rpart
## Max.cor.Y.Time.Lag##rcv#glmnet Max.cor.Y.Time.Lag##rcv#glmnet
## Low.cor.X##rcv#glmnet Low.cor.X##rcv#glmnet
## Max.cor.Y.Time.Poly##rcv#glmnet Max.cor.Y.Time.Poly##rcv#glmnet
## Max.cor.Y.rcv.1X1.cp.0###rpart Max.cor.Y.rcv.1X1.cp.0###rpart
## Interact.High.cor.Y##rcv#glmnet Interact.High.cor.Y##rcv#glmnet
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1###glmnet
## Max.cor.Y.rcv.5X3##rcv#glmnet Max.cor.Y.rcv.5X3##rcv#glmnet
## Max.cor.Y.rcv.5X1##rcv#glmnet Max.cor.Y.rcv.5X1##rcv#glmnet
## Max.cor.Y.rcv.5X5##rcv#glmnet Max.cor.Y.rcv.5X5##rcv#glmnet
## Max.cor.Y.rcv.3X1##rcv#glmnet Max.cor.Y.rcv.3X1##rcv#glmnet
## Max.cor.Y.rcv.3X3##rcv#glmnet Max.cor.Y.rcv.3X3##rcv#glmnet
## Max.cor.Y.rcv.3X5##rcv#glmnet Max.cor.Y.rcv.3X5##rcv#glmnet
## All.X##rcv#glmnet All.X##rcv#glmnet
## MFO###myMFO_classfr MFO###myMFO_classfr
## Random###myrandom_classfr Random###myrandom_classfr
## Final##rcv#glmnet Final##rcv#glmnet
## max.Accuracy.OOB max.AUCROCR.OOB
## Max.cor.Y##rcv#rpart 0.8200231 0.5892132
## Max.cor.Y.Time.Lag##rcv#glmnet 0.7818287 0.8024758
## Low.cor.X##rcv#glmnet 0.7783565 0.8052766
## Max.cor.Y.Time.Poly##rcv#glmnet 0.7754630 0.7997373
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.7673611 0.7773858
## Interact.High.cor.Y##rcv#glmnet 0.7656250 0.8140971
## Max.cor.Y.rcv.1X1###glmnet 0.7604167 0.8116126
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.7604167 0.8114863
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.7575231 0.8067975
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.7575231 0.8067975
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.7575231 0.8067975
## All.X##rcv#glmnet 0.6244213 0.8129999
## MFO###myMFO_classfr 0.1331019 0.5000000
## Random###myrandom_classfr 0.1331019 0.4857956
## Final##rcv#glmnet NA NA
## max.AUCpROC.OOB max.Accuracy.fit
## Max.cor.Y##rcv#rpart 0.5870523 0.9296422
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5927265 0.9279769
## Low.cor.X##rcv#glmnet 0.5917252 0.9276303
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5950717 0.9319320
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.6174697 0.9381765
## Interact.High.cor.Y##rcv#glmnet 0.6009259 0.9315850
## Max.cor.Y.rcv.1X1###glmnet 0.5962443 0.9329725
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5962443 0.9333905
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5962443 0.9331818
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5962443 0.9331816
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.5962443 0.9335973
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.5962443 0.9333193
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.5962443 0.9332218
## All.X##rcv#glmnet 0.5873513 0.9233293
## MFO###myMFO_classfr 0.5000000 0.1796420
## Random###myrandom_classfr 0.5125675 0.1796420
## Final##rcv#glmnet NA 0.9060012
## opt.prob.threshold.fit
## Max.cor.Y##rcv#rpart 0.6
## Max.cor.Y.Time.Lag##rcv#glmnet 0.2
## Low.cor.X##rcv#glmnet 0.2
## Max.cor.Y.Time.Poly##rcv#glmnet 0.4
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.4
## Interact.High.cor.Y##rcv#glmnet 0.4
## Max.cor.Y.rcv.1X1###glmnet 0.5
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.5
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.5
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.5
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.4
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.4
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.4
## All.X##rcv#glmnet 0.3
## MFO###myMFO_classfr 0.1
## Random###myrandom_classfr 0.1
## Final##rcv#glmnet 0.3
## opt.prob.threshold.OOB
## Max.cor.Y##rcv#rpart 0.6
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1
## Low.cor.X##rcv#glmnet 0.1
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1.cp.0###rpart 0.1
## Interact.High.cor.Y##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.1
## Max.cor.Y.rcv.5X3##rcv#glmnet 0.1
## Max.cor.Y.rcv.5X1##rcv#glmnet 0.1
## Max.cor.Y.rcv.5X5##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X1##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X3##rcv#glmnet 0.1
## Max.cor.Y.rcv.3X5##rcv#glmnet 0.1
## All.X##rcv#glmnet 0.1
## MFO###myMFO_classfr 0.1
## Random###myrandom_classfr 0.1
## Final##rcv#glmnet NA
## [1] "All.X##rcv#glmnet OOB confusion matrix & accuracy: "
## Prediction
## Reference N Y
## N 874 624
## Y 25 205
## .freqRatio.Fit .freqRatio.OOB
## OpEd#Opinion# 0.090965862 0.0515046296
## #Opinion#ThePublicEditor 0.003330558 0.0023148148
## Styles#U.S.# 0.026436303 0.0289351852
## Business#Crosswords/Games# 0.021856786 0.0104166667
## Science#Health# 0.030807660 0.0277777778
## Business#Technology# 0.044338052 0.0729166667
## ## 0.190049958 0.2146990741
## Business#BusinessDay#Dealbook 0.130932556 0.1869212963
## Metro#N.Y./Region# 0.026644463 0.0405092593
## Culture#Arts# 0.101998335 0.1070601852
## #Opinion#RoomForDebate 0.008742714 0.0115740741
## Styles##Fashion 0.021648626 0.0086805556
## Business#BusinessDay#SmallBusiness 0.020815987 0.0231481481
## myOther 0.006869276 0.0028935185
## Travel#Travel# 0.017277269 0.0196759259
## Culture## NA 0.0005787037
## Foreign#World#AsiaPacific 0.031223980 0.0306712963
## #Multimedia# 0.019150708 0.0283564815
## TStyle## 0.129683597 0.0584490741
## #U.S.#Education 0.050582848 0.0474537037
## Foreign#World# 0.026644463 0.0254629630
## .freqRatio.Tst .n.Fit .n.New.N .n.New.Y
## OpEd#Opinion# 0.087700535 437 NA 164
## #Opinion#ThePublicEditor 0.005347594 16 NA 10
## Styles#U.S.# 0.032620321 127 NA 61
## Business#Crosswords/Games# 0.022459893 105 NA 42
## Science#Health# 0.030481283 148 NA 57
## Business#Technology# 0.060962567 213 45 69
## ## 0.182887701 913 270 72
## Business#BusinessDay#Dealbook 0.162566845 629 192 112
## Metro#N.Y./Region# 0.035828877 128 41 26
## Culture#Arts# 0.093048128 490 156 18
## #Opinion#RoomForDebate 0.010695187 42 9 11
## Styles##Fashion 0.008021390 104 14 1
## Business#BusinessDay#SmallBusiness 0.021925134 100 21 20
## myOther 0.002673797 33 4 1
## Travel#Travel# 0.018716578 83 35 NA
## Culture## 0.037433155 NA 51 19
## Foreign#World#AsiaPacific 0.029946524 150 41 15
## #Multimedia# 0.027807487 92 43 9
## TStyle## 0.056149733 623 101 4
## #U.S.#Education 0.047593583 243 83 6
## Foreign#World# 0.025133690 128 46 1
## .n.OOB .n.Trn.N .n.Trn.Y .n.Tst .n.fit
## OpEd#Opinion# 89 117 409 164 437
## #Opinion#ThePublicEditor 4 4 16 10 16
## Styles#U.S.# 50 77 100 61 127
## Business#Crosswords/Games# 18 20 103 42 105
## Science#Health# 48 74 122 57 148
## Business#Technology# 126 288 51 114 213
## ## 371 1169 115 342 913
## Business#BusinessDay#Dealbook 323 864 88 304 629
## Metro#N.Y./Region# 70 181 17 67 128
## Culture#Arts# 185 625 50 174 490
## #Opinion#RoomForDebate 20 61 1 20 42
## Styles##Fashion 15 118 1 15 104
## Business#BusinessDay#SmallBusiness 40 135 5 41 100
## myOther 5 38 NA 5 33
## Travel#Travel# 34 116 1 35 83
## Culture## 1 1 NA 70 NA
## Foreign#World#AsiaPacific 53 200 3 56 150
## #Multimedia# 49 139 2 52 92
## TStyle## 101 715 9 105 623
## #U.S.#Education 82 325 NA 89 243
## Foreign#World# 44 172 NA 47 128
## .n.new .n.trn err.abs.OOB.mean
## OpEd#Opinion# 164 526 0.52507201
## #Opinion#ThePublicEditor 10 20 0.48776373
## Styles#U.S.# 61 177 0.47240073
## Business#Crosswords/Games# 42 123 0.46577147
## Science#Health# 57 196 0.46246888
## Business#Technology# 114 339 0.23785179
## ## 342 1284 0.21029400
## Business#BusinessDay#Dealbook 304 952 0.20489878
## Metro#N.Y./Region# 67 198 0.19316339
## Culture#Arts# 174 675 0.18740433
## #Opinion#RoomForDebate 20 62 0.18687462
## Styles##Fashion 15 119 0.14396810
## Business#BusinessDay#SmallBusiness 41 140 0.14068302
## myOther 5 38 0.11350681
## Travel#Travel# 35 117 0.10662702
## Culture## 70 1 0.10281135
## Foreign#World#AsiaPacific 56 203 0.10071396
## #Multimedia# 52 141 0.09870894
## TStyle## 105 724 0.09414148
## #U.S.#Education 89 325 0.07336847
## Foreign#World# 47 172 0.07263139
## err.abs.fit.mean err.abs.new.mean
## OpEd#Opinion# 0.38754474 NA
## #Opinion#ThePublicEditor 0.44112438 NA
## Styles#U.S.# 0.49067998 NA
## Business#Crosswords/Games# 0.35892352 NA
## Science#Health# 0.45323523 NA
## Business#Technology# 0.21606359 NA
## ## 0.14544300 NA
## Business#BusinessDay#Dealbook 0.15387372 NA
## Metro#N.Y./Region# 0.15544946 NA
## Culture#Arts# 0.12294604 NA
## #Opinion#RoomForDebate 0.15563377 NA
## Styles##Fashion 0.08608630 NA
## Business#BusinessDay#SmallBusiness 0.13037114 NA
## myOther 0.11274386 NA
## Travel#Travel# 0.08189305 NA
## Culture## NA NA
## Foreign#World#AsiaPacific 0.10364742 NA
## #Multimedia# 0.09070339 NA
## TStyle## 0.06992473 NA
## #U.S.#Education 0.06363446 NA
## Foreign#World# 0.06981429 NA
## err.abs.trn.mean err.abs.OOB.sum
## OpEd#Opinion# 0.36112023 46.7314088
## #Opinion#ThePublicEditor 0.38900454 1.9510549
## Styles#U.S.# 0.47461795 23.6200366
## Business#Crosswords/Games# 0.29691537 8.3838865
## Science#Health# 0.41811251 22.1985061
## Business#Technology# 0.21494716 29.9693251
## ## 0.13761708 78.0190757
## Business#BusinessDay#Dealbook 0.16192176 66.1823064
## Metro#N.Y./Region# 0.14493692 13.5214374
## Culture#Arts# 0.12194793 34.6698010
## #Opinion#RoomForDebate 0.13434363 3.7374924
## Styles##Fashion 0.08404609 2.1595216
## Business#BusinessDay#SmallBusiness 0.12267434 5.6273209
## myOther 0.08938976 0.5675341
## Travel#Travel# 0.06217589 3.6253187
## Culture## 0.06323730 0.1028114
## Foreign#World#AsiaPacific 0.08200063 5.3378398
## #Multimedia# 0.07335717 4.8367383
## TStyle## 0.04963747 9.5082891
## #U.S.#Education 0.04324332 6.0162144
## Foreign#World# 0.04765510 3.1957814
## err.abs.fit.sum err.abs.new.sum
## OpEd#Opinion# 169.357052 NA
## #Opinion#ThePublicEditor 7.057990 NA
## Styles#U.S.# 62.316357 NA
## Business#Crosswords/Games# 37.686969 NA
## Science#Health# 67.078814 NA
## Business#Technology# 46.021544 NA
## ## 132.789459 NA
## Business#BusinessDay#Dealbook 96.786571 NA
## Metro#N.Y./Region# 19.897531 NA
## Culture#Arts# 60.243557 NA
## #Opinion#RoomForDebate 6.536618 NA
## Styles##Fashion 8.952975 NA
## Business#BusinessDay#SmallBusiness 13.037114 NA
## myOther 3.720547 NA
## Travel#Travel# 6.797123 NA
## Culture## NA NA
## Foreign#World#AsiaPacific 15.547113 NA
## #Multimedia# 8.344712 NA
## TStyle## 43.563105 NA
## #U.S.#Education 15.463174 NA
## Foreign#World# 8.936229 NA
## err.abs.trn.sum
## OpEd#Opinion# 189.9492434
## #Opinion#ThePublicEditor 7.7800907
## Styles#U.S.# 84.0073771
## Business#Crosswords/Games# 36.5205904
## Science#Health# 81.9500512
## Business#Technology# 72.8670876
## ## 176.7003249
## Business#BusinessDay#Dealbook 154.1495179
## Metro#N.Y./Region# 28.6975109
## Culture#Arts# 82.3148502
## #Opinion#RoomForDebate 8.3293051
## Styles##Fashion 10.0014851
## Business#BusinessDay#SmallBusiness 17.1744071
## myOther 3.3968110
## Travel#Travel# 7.2745787
## Culture## 0.0632373
## Foreign#World#AsiaPacific 16.6461272
## #Multimedia# 10.3433609
## TStyle## 35.9375293
## #U.S.#Education 14.0540806
## Foreign#World# 8.1966765
## .freqRatio.Fit .freqRatio.OOB .freqRatio.Tst .n.Fit
## NA 1.000000 1.000000 NA
## .n.New.N .n.New.Y .n.OOB .n.Trn.N
## NA NA 1728.000000 5439.000000
## .n.Trn.Y .n.Tst .n.fit .n.new
## NA 1870.000000 NA 1870.000000
## .n.trn err.abs.OOB.mean err.abs.fit.mean err.abs.new.mean
## 6532.000000 4.681124 NA NA
## err.abs.trn.mean err.abs.OOB.sum err.abs.fit.sum err.abs.new.sum
## 3.572902 369.961700 NA NA
## err.abs.trn.sum
## 1046.354243
## All.X__rcv_glmnet.imp
## PubDate.day.minutes.poly.1 100.00000
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg 63.88597
## PubDate.day.minutes.poly.4 59.18904
## NDSSName.my.fctrOpEd#Opinion# 47.21133
## NDSSName.my.fctrBusiness#Crosswords/Games# 45.38051
## NDSSName.my.fctrScience#Health# 43.69957
## PubDate.day.minutes.poly.2 43.63891
## NDSSName.my.fctr#Opinion#ThePublicEditor 43.52674
## NDSSName.my.fctrStyles#U.S.# 42.53063
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg 41.77929
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg 35.10334
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg 34.04441
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg 33.86620
## PubDate.wkend 30.50180
## WordCount.log1p 30.43597
## PubDate.hour.fctr(15.3,23] 29.64299
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg 29.59778
## WordCount.root2 29.52473
## PubDate.last4.log1p 29.48100
## PubDate.last2.log1p 29.43840
## PubDate.last8.log1p 29.36106
## NDSSName.my.fctrBusiness#Technology# 29.35337
## .rnorm 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrCulture## 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region# 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.5.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.1.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.2.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.3.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.4.ctg 29.35337
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.5.ctg 29.35337
## PubDate.date.fctr(13,19] 29.35337
## PubDate.date.fctr(19,25] 29.35337
## PubDate.date.fctr(25,31] 29.35337
## PubDate.date.fctr(7,13] 29.35337
## PubDate.day.minutes.poly.3 29.35337
## PubDate.day.minutes.poly.5 29.35337
## PubDate.hour.fctr(7.67,15.3] 29.35337
## PubDate.juliandate 29.35337
## PubDate.last16.log1p 29.35337
## PubDate.last32.log1p 29.35337
## PubDate.minute.fctr(14.8,29.5] 29.35337
## PubDate.minute.fctr(29.5,44.2] 29.35337
## PubDate.minute.fctr(44.2,59.1] 29.35337
## PubDate.month.fctr10 29.35337
## PubDate.month.fctr11 29.35337
## PubDate.month.fctr12 29.35337
## PubDate.second.fctr(14.8,29.5] 29.35337
## PubDate.second.fctr(29.5,44.2] 29.35337
## PubDate.second.fctr(44.2,59.1] 29.35337
## PubDate.wkday.fctr1 29.35337
## PubDate.wkday.fctr2 29.35337
## PubDate.wkday.fctr3 29.35337
## PubDate.wkday.fctr4 29.35337
## PubDate.wkday.fctr5 29.35337
## PubDate.wkday.fctr6 29.35337
## WordCount.nexp 29.35337
## NDSSName.my.fctrmyOther 29.23532
## NDSSName.my.fctr#Multimedia# 28.98875
## NDSSName.my.fctrTravel#Travel# 28.38092
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 28.28957
## NDSSName.my.fctrForeign#World# 28.20781
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 28.13044
## NDSSName.my.fctrCulture#Arts# 28.06442
## NDSSName.my.fctrStyles##Fashion 27.37098
## NDSSName.my.fctr#U.S.#Education 27.32131
## NDSSName.my.fctrForeign#World#AsiaPacific 27.02315
## NDSSName.my.fctrTStyle## 26.26230
## NDSSName.my.fctr#Opinion#RoomForDebate 24.63959
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg 24.23150
## imp
## PubDate.day.minutes.poly.1 100.00000
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg 53.53417
## PubDate.day.minutes.poly.4 66.02225
## NDSSName.my.fctrOpEd#Opinion# 60.56361
## NDSSName.my.fctrBusiness#Crosswords/Games# 58.80165
## NDSSName.my.fctrScience#Health# 55.81153
## PubDate.day.minutes.poly.2 80.90424
## NDSSName.my.fctr#Opinion#ThePublicEditor 56.22569
## NDSSName.my.fctrStyles#U.S.# 54.55137
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg 42.90290
## PubDate.wkend 43.89547
## WordCount.log1p 43.54951
## PubDate.hour.fctr(15.3,23] 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg 42.31219
## WordCount.root2 42.55927
## PubDate.last4.log1p 42.31219
## PubDate.last2.log1p 42.31219
## PubDate.last8.log1p 42.31219
## NDSSName.my.fctrBusiness#Technology# 43.62216
## .rnorm 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrCulture## 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region# 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.5.ctg 42.31219
## PubDate.date.fctr(13,19] 42.31219
## PubDate.date.fctr(19,25] 42.31219
## PubDate.date.fctr(25,31] 42.31219
## PubDate.date.fctr(7,13] 42.31219
## PubDate.day.minutes.poly.3 42.31219
## PubDate.day.minutes.poly.5 42.31219
## PubDate.hour.fctr(7.67,15.3] 42.31219
## PubDate.juliandate 42.31219
## PubDate.last16.log1p 42.31219
## PubDate.last32.log1p 42.31219
## PubDate.minute.fctr(14.8,29.5] 42.31219
## PubDate.minute.fctr(29.5,44.2] 42.31219
## PubDate.minute.fctr(44.2,59.1] 42.31219
## PubDate.month.fctr10 42.31219
## PubDate.month.fctr11 42.31219
## PubDate.month.fctr12 42.31219
## PubDate.second.fctr(14.8,29.5] 42.31219
## PubDate.second.fctr(29.5,44.2] 42.31219
## PubDate.second.fctr(44.2,59.1] 42.31219
## PubDate.wkday.fctr1 42.31219
## PubDate.wkday.fctr2 42.31219
## PubDate.wkday.fctr3 42.31219
## PubDate.wkday.fctr4 42.31219
## PubDate.wkday.fctr5 42.31219
## PubDate.wkday.fctr6 42.31219
## WordCount.nexp 42.31219
## NDSSName.my.fctrmyOther 42.31219
## NDSSName.my.fctr#Multimedia# 42.31219
## NDSSName.my.fctrTravel#Travel# 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 42.31219
## NDSSName.my.fctrForeign#World# 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 42.31219
## NDSSName.my.fctrCulture#Arts# 42.31219
## NDSSName.my.fctrStyles##Fashion 41.91391
## NDSSName.my.fctr#U.S.#Education 40.11307
## NDSSName.my.fctrForeign#World#AsiaPacific 39.97221
## NDSSName.my.fctrTStyle## 39.60135
## NDSSName.my.fctr#Opinion#RoomForDebate 35.43704
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg 42.31219
## Final__rcv_glmnet.imp
## PubDate.day.minutes.poly.1 100.00000
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.3.ctg 53.53417
## PubDate.day.minutes.poly.4 66.02225
## NDSSName.my.fctrOpEd#Opinion# 60.56361
## NDSSName.my.fctrBusiness#Crosswords/Games# 58.80165
## NDSSName.my.fctrScience#Health# 55.81153
## PubDate.day.minutes.poly.2 80.90424
## NDSSName.my.fctr#Opinion#ThePublicEditor 56.22569
## NDSSName.my.fctrStyles#U.S.# 54.55137
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.4.ctg 42.90290
## PubDate.wkend 43.89547
## WordCount.log1p 43.54951
## PubDate.hour.fctr(15.3,23] 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.2.ctg 42.31219
## WordCount.root2 42.55927
## PubDate.last4.log1p 42.31219
## PubDate.last2.log1p 42.31219
## PubDate.last8.log1p 42.31219
## NDSSName.my.fctrBusiness#Technology# 43.62216
## .rnorm 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Multimedia#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Opinion#RoomForDebate:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#Opinion#ThePublicEditor:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctr#U.S.#Education:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#Crosswords/Games#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrBusiness#Technology#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrCulture## 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrCulture##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrCulture#Arts#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrForeign#World#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrForeign#World#AsiaPacific:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region# 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrMetro#N.Y./Region#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrScience#Health#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrStyles##Fashion:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrStyles#U.S.#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrTStyle##:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrTravel#Travel#:PubDate.day.minutes.poly.5.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.1.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.2.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.3.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.4.ctg 42.31219
## NDSSName.my.fctrmyOther:PubDate.day.minutes.poly.5.ctg 42.31219
## PubDate.date.fctr(13,19] 42.31219
## PubDate.date.fctr(19,25] 42.31219
## PubDate.date.fctr(25,31] 42.31219
## PubDate.date.fctr(7,13] 42.31219
## PubDate.day.minutes.poly.3 42.31219
## PubDate.day.minutes.poly.5 42.31219
## PubDate.hour.fctr(7.67,15.3] 42.31219
## PubDate.juliandate 42.31219
## PubDate.last16.log1p 42.31219
## PubDate.last32.log1p 42.31219
## PubDate.minute.fctr(14.8,29.5] 42.31219
## PubDate.minute.fctr(29.5,44.2] 42.31219
## PubDate.minute.fctr(44.2,59.1] 42.31219
## PubDate.month.fctr10 42.31219
## PubDate.month.fctr11 42.31219
## PubDate.month.fctr12 42.31219
## PubDate.second.fctr(14.8,29.5] 42.31219
## PubDate.second.fctr(29.5,44.2] 42.31219
## PubDate.second.fctr(44.2,59.1] 42.31219
## PubDate.wkday.fctr1 42.31219
## PubDate.wkday.fctr2 42.31219
## PubDate.wkday.fctr3 42.31219
## PubDate.wkday.fctr4 42.31219
## PubDate.wkday.fctr5 42.31219
## PubDate.wkday.fctr6 42.31219
## WordCount.nexp 42.31219
## NDSSName.my.fctrmyOther 42.31219
## NDSSName.my.fctr#Multimedia# 42.31219
## NDSSName.my.fctrTravel#Travel# 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#SmallBusiness 42.31219
## NDSSName.my.fctrForeign#World# 42.31219
## NDSSName.my.fctrBusiness#BusinessDay#Dealbook 42.31219
## NDSSName.my.fctrCulture#Arts# 42.31219
## NDSSName.my.fctrStyles##Fashion 41.91391
## NDSSName.my.fctr#U.S.#Education 40.11307
## NDSSName.my.fctrForeign#World#AsiaPacific 39.97221
## NDSSName.my.fctrTStyle## 39.60135
## NDSSName.my.fctr#Opinion#RoomForDebate 35.43704
## NDSSName.my.fctrOpEd#Opinion#:PubDate.day.minutes.poly.1.ctg 42.31219
## [1] "glbObsNew prediction stats:"
##
## N Y
## 1152 718
## label step_major step_minor label_minor bgn end
## 16 predict.data.new 8 0 0 610.617 639.13
## 17 display.session.info 9 0 0 639.131 NA
## elapsed
## 16 28.514
## 17 NA
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance by am_fctr appears to be independent.
#{r q1, cache=FALSE}
# print(t.test(subset(cars_df, am_fctr == "automatic")$mpg,
#              subset(cars_df, am_fctr == "manual")$mpg,
#              var.equal=FALSE)$conf)
#
We reject the null hypothesis, i.e., we have evidence to conclude that am_fctr impacts mpg (95% confidence). Manual transmission yields better miles per gallon than automatic transmission.
## label step_major step_minor label_minor bgn
## 10 fit.models 6 0 0 74.756
## 14 fit.data.training 7 0 0 449.661
## 11 fit.models 6 1 1 350.285
## 16 predict.data.new 8 0 0 610.617
## 12 fit.models 6 2 2 416.565
## 9 select.features 5 0 0 49.295
## 1 import.data 1 0 0 17.432
## 15 fit.data.training 7 1 1 595.944
## 5 extract.features 3 0 0 39.267
## 13 fit.models 6 3 3 442.685
## 2 inspect.data 2 0 0 34.360
## 8 partition.data.training 4 0 0 47.818
## 6 manage.missing.data 3 1 1 46.382
## 3 scrub.data 2 1 1 37.863
## 7 cluster.data 3 2 2 47.491
## 4 transform.data 2 2 2 38.949
## end elapsed duration
## 10 350.285 275.529 275.529
## 14 595.943 146.282 146.282
## 11 416.564 66.279 66.279
## 16 639.130 28.514 28.513
## 12 442.684 26.119 26.119
## 9 74.755 25.460 25.460
## 1 34.359 16.927 16.927
## 15 610.616 14.672 14.672
## 5 46.381 7.114 7.114
## 13 449.661 6.976 6.976
## 2 37.863 3.503 3.503
## 8 49.295 1.477 1.477
## 6 47.490 1.108 1.108
## 3 38.948 1.086 1.085
## 7 47.817 0.326 0.326
## 4 39.267 0.318 0.318
## [1] "Total Elapsed Time: 639.13 secs"